Description from Kaggle

In this competition the challenge is to predict the return of a stock, given the history of the past few days.

We provide 5-day windows of time, days D-2, D-1, D, D+1, and D+2. You are given returns in days D-2, D-1, and part of day D, and you are asked to predict the returns in the rest of day D, and in days D+1 and D+2.

During day D, there is intraday return data, which are the returns at different points in the day. We provide 180 minutes of data, from t=1 to t=180. In the training set you are given the full 180 minutes; in the test set, only the first 120 minutes are provided.

For each 5-day window, we also provide 25 features, Feature_1 to Feature_25. These may or may not be useful in your prediction.

Each row in the dataset is an arbitrary stock in an arbitrary 5-day time window.
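
For orientation, this is the column layout the code below relies on (summarized from how the columns are used throughout; check the actual headers after loading the CSVs):

# Id                            row identifier (used as row name when reading)
# Feature_1 ... Feature_25      anonymized features per window (a few of them categorical)
# Ret_MinusTwo, Ret_MinusOne    daily returns of days D-2 and D-1
# Ret_2 ... Ret_120             intraday returns of day D, present in train and test
# Ret_121 ... Ret_180           intraday returns of day D, train only (to predict)
# Ret_PlusOne, Ret_PlusTwo      daily returns of days D+1 and D+2, train only (to predict)
# Weight_Intraday, Weight_Daily weights used by the evaluation metric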

Outline

And apropos originality… the code below is already quite a clean-up :D

Below are the correlations between the past returns in the test data (figure: test_past_returns), in the “outlier” training data (figure: outlier_past_returns), and finally in the cleaned-up training data (figure: clean_past_returns).

some plotting functions

################################################################################################################
### some functions (sorry - I had some of them at hand already)
f.yellowredblue <- function(x) {
  r <- approx(c(0, 0.5, 1), c(1, 1, 0), n = x)$y
  g <- approx(c(0, 0.5, 1), c(1, 0, 0), n = x)$y
  b <- approx(c(0, 0.5, 1), c(0, 0, 1), n = x)$y
  return(rgb(r, g, b))
}
f.blueredyellow <- function(x) {
  return(rev(f.yellowredblue(x)))
}
f.comp <- function(x, rt) {
  # compare against a threshold rt: strictly positive if rt == 0, else >= rt
  if (rt == 0) { y <- x > 0 }
  else { y <- x >= rt }
  return(y)
}

f.generic.correlation.matrix <- function(data, rDir, outfile, corMethod = "pearson", useOnlyHighVar = FALSE, tryAutoColor = TRUE, highVarPercentage = 0.9, corUse = "everything") {
  require("gplots")
  if (useOnlyHighVar) {
    varVec <- apply(data, 1, var)
    lowerBound <- quantile(varVec, highVarPercentage)
    data <- data[varVec>=lowerBound,]
  }
  sampleCorMat <- cor(data, method = corMethod, use = corUse)
  # some plotting parameters
  minCor <- min(sampleCorMat, na.rm = TRUE)
  maxCor <- max(sampleCorMat[sampleCorMat != 1], na.rm = TRUE)
  if (sum(is.na(c(minCor, maxCor))) > 0) { tryAutoColor <- FALSE }
  if (tryAutoColor) {
    # snap the color range outward to round bounds
    if (minCor > 0.8) { minCor <- 0.8 } else if (minCor > 0.7) { minCor <- 0.7 } else if (minCor > 0.5) { minCor <- 0.5 }
    if (maxCor > 0.8) { maxCor <- 1 } else if (maxCor > 0.7) { maxCor <- 0.8 } else if (maxCor > 0.5) { maxCor <- 0.7 } else { maxCor <- 0.5 }
  }
  numCols <- ceiling((maxCor-minCor)*50)
  tiff(file.path(rDir, paste(outfile, "_cor_mat.tiff", sep = '')), width = 1600, height = 1600, compression = "lzw", type = "Xlib")
  labelCex <- 1 + 1/log10(nrow(sampleCorMat))
  noteCex <- 4
  if (nrow(sampleCorMat) > 6) { noteCex <- 3 }
  if (nrow(sampleCorMat) > 12) { noteCex <- 2 }
  if (nrow(sampleCorMat) > 18) { noteCex <- 1 }
  if (nrow(sampleCorMat) > 30) { #no notes
    heatmap.2(sampleCorMat, col = f.blueredyellow(numCols), trace="none", scale = "none", margins = c(15,15), breaks = seq(minCor, maxCor, length.out = numCols+1))
  } else {
    heatmap.2(sampleCorMat, col = f.blueredyellow(numCols), trace="none", scale = "none", margins = c(15,15), cellnote = round(sampleCorMat,2), notecol = "black", notecex = noteCex, breaks = seq(minCor, maxCor, length.out = numCols+1), cexRow = labelCex, cexCol = labelCex)
  }
  dev.off()
}
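
A minimal call for reference (toy matrix and tempdir() are made up for illustration; the tiff type "Xlib" assumes an X11-capable Linux box, as in the processing section below):

set.seed(42)
base <- rnorm(200)
toy <- sapply(1:5, function(i) base + rnorm(200, sd = i/2)) # five noisy copies of one signal
colnames(toy) <- paste0("S", 1:5)
f.generic.correlation.matrix(toy, tempdir(), "toy_example")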

f.histogram <- function(x, y = c(), xName = "", useFreq = FALSE, doNotShowSummary = FALSE, useLog = FALSE, ...) 
{
  # this is a normal histogram mode (not bars, but frequency or density)
  # x: values
  # xName: x-axis label
  # useFreq: will plot frequencies instead of densities
  if (length(x) == length(y)) {
    toPlot <- list(x = x, y = y)
    class(toPlot) <- "density"
  } else {
    toPlot <- density(x)
  }
  if (useFreq) { 
    toPlot$y <- toPlot$y * length(x)
  }
  if (useLog) {
    if (useFreq) { toPlot$y <- log2(toPlot$y+1) }
    else { cat("log only possible for useFreq = TRUE\n") }
  }
  par(mar = c(5,4,1,1))
  plot(
    toPlot,
    #main = "",
    bty = "n",
    xaxs = "r",
    yaxs = "r",
    xlab = xName,
    ylab = "",
    las = 1,
    cex = 0.4,
    tck = 0.01,
    ...
  )
  if (!doNotShowSummary) {
    text(
      par("usr")[1] + par("usr")[2]/15,
      par("usr")[4] - par("usr")[4]/15,
      adj = c(0,1),
      labels = paste(c("median:", "mean:", "sd:"),
                     c(round(median(x), digits = 3),
                       round(mean(x), digits = 3),
                       round(sd(x), digits = 3)),
                     collapse = "\n", sep=" ")
    )
  }
}

f.scatter <- function(x, y, xName = "", yName = "", cexFactor = 1, ...)
{
  # this will draw a normal scatterplot
  # x: x values
  # y: y values
  # xName: x-axis label
  # yName: y-axis label
  par(mar = c(5,5,1,1))
  plot(
    y ~ x,
    #main = "",
    bty = "n",
    xaxs = "r",
    yaxs = "r",
    xlab = xName,
    ylab = yName,
    las = 1,
    cex = 0.2*cexFactor,
    tck = 0.01,
    pch = 19,
    ...
  )
  text(
    par("usr")[1] + par("usr")[2]/15,
    par("usr")[4] - par("usr")[4]/15,
    adj = c(0,1),
    labels = paste(c("corP:", "corS:", "n:"),
                   c(round(cor(x,y,method="pearson"), digits = 3),
                     round(cor(x,y,method="spearman"), digits = 3),
                     length(x)),
                   collapse = "\n", sep=" ")
  )
}

f.genetodensity <- function(x,y) {
  require("MASS")
  est <- kde2d(x, y, n = 50) # needs MASS - calculates the two dimensional density
  if (sum(is.na(est$z)) > 0) {
    est <- kde2d(x, y, n = 50, h = c(bw.nrd0(x), bw.nrd0(y))) # needs MASS - calculates the two dimensional density
  }
  dvec <- as.vector(est$z) # note: as.vector takes columnwise, rows correspond to the value of x, columns to the value of y - the y index counts therefore 50 times
  xind <- findInterval(x, est$x) # find the intervals to which a gene belongs
  yind <- findInterval(y, est$y) # find the intervals to which a gene belongs
  vecind <- (yind-1)*50 + xind # this is a vector with indices for dvec. 
  dens <- dvec[vecind]
  names(dens) <- names(x)
  return(dens)
}
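
The index arithmetic hinges on R storing matrices column-major: entry [xind, yind] of est$z sits at position (yind-1)*nrow + xind of as.vector(est$z). A one-line sanity check on a toy matrix:

z <- matrix(1:12, nrow = 3)
z[2, 4] == as.vector(z)[(4-1)*3 + 2] # TRUE - same element via the vector index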

f.genetodensitycolor <- function(x,y) {
  require("MASS")
  require("colorRamps")
  ## NOTE: would this break if two grid points had exactly the same density?
  ## for a nice picture it is important to remove the grid points without genes (the ones with the lowest density)
  grids <- 100
  out <- list()
  est <- kde2d(x, y, n = grids) # needs MASS - calculates the two dimensional density
  if (sum(is.na(est$z)) > 0) {
    est <- kde2d(x, y, n = grids, h = c(bw.nrd0(x), bw.nrd0(y))) # needs MASS - calculates the two dimensional density
  }
  dvec <- as.vector(est$z) # note: as.vector takes columnwise, rows correspond to the value of x, columns to the value of y - the y index counts therefore grids times
  xind <- findInterval(x, est$x) # find the intervals to which a gene belongs
  yind <- findInterval(y, est$y) # find the intervals to which a gene belongs
  vecind <- (yind-1)*grids + xind # this is a vector with indices for dvec. 
  dens <- dvec[vecind]
  dens_with_genes <- unique(dens)
  names(dens) <- names(x)
  out$dens <- dens
  out$denschar <- as.vector(dens, mode = "character")
  colgrad <- blue2red(length(dens_with_genes))
  #colgrad <- rainbow(length(dens_with_genes), start = 0, end = 1, alpha = 0.8)
  #colgrad <- heat.colors(length(dens_with_genes), alpha = 0.8)
  #colgrad <- terrain.colors(length(dens_with_genes), alpha = 0.8)
  #colgrad <- topo.colors(length(dens_with_genes), alpha = 0.8)
  #colgrad <- cm.colors(length(dens_with_genes), alpha = 0.8)
  sdens_with_genes <- sort(dens_with_genes, decreasing = FALSE)
  names(colgrad) <- sdens_with_genes
  out$cols <- colgrad
  genecols <- colgrad[as.vector(dens, mode = "character")]
  names(genecols) <- names(x)
  out$genecols <- genecols
  return(out)
}

f.replace.Inf.by.next <- function(x) {
  if (sum(abs(x) == Inf, na.rm = TRUE) == 0) {return(x)}
  cat("replacing Inf/-Inf with the next highest/lowest value\n")
  notNA <- !is.na(x)
  minInf <- x == -Inf
  pluInf <- x == Inf
  nx <- x[notNA&(!minInf)&(!pluInf)]
  x[notNA&pluInf] <- max(nx)
  x[notNA&minInf] <- min(nx)
  return(x)
}
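
A quick check of this helper on a toy vector (Inf becomes the largest finite value, -Inf the smallest, NA stays NA):

f.replace.Inf.by.next(c(1, Inf, -Inf, NA, 5)) # 1 5 1 NA 5 (plus the message)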

f.density.scatter <- function(x, y, xName = "", yName = "", main = "", tryToScale = FALSE, ...) {
  x <- f.replace.Inf.by.next(x)
  y <- f.replace.Inf.by.next(y)
  if (tryToScale) {
    if (max(x) > 10*median(x)) {
      isExtreme <- x > 10*median(x)
      cat("limiting x to 10*median(x) - replacing", sum(isExtreme), "extreme values (", mean(isExtreme)*100, "percent)\n")
      x[isExtreme] <- 10*median(x)
    }
    if (max(y) > 10*median(y)) {
      isExtreme <- y > 10*median(y)
      cat("limiting y to 10*median(y) - replacing", sum(isExtreme), "extreme values (", mean(isExtreme)*100, "percent)\n")
      y[isExtreme] <- 10*median(y)
    }
  }
  #   xLimits <- range(x)
  #   if (max(x) > 10*median(x)) {
  #     cat("limiting plot to a quantile(x, 0.99)\n")
  #     xLimits[2] <- quantile(x, 0.99)
  #   }
  #   yLimits <- range(y)
  #   if (max(y) > 10*median(y)) {
  #     cat("limiting plot to a quantile(y, 0.99)\n")
  #     yLimits[2] <- quantile(y, 0.99)
  #   }
  colorGrad <- f.genetodensitycolor(x, y)
  # f.scatter(x, y, xName, yName, cexFactor = 1, col = colorGrad$genecols, main = main, xlim = xLimits, ylim = yLimits)
  f.scatter(x, y, xName, yName, cexFactor = 1, col = colorGrad$genecols, main = main, ...)
  return(NULL)
}

f.allinone <- function(data, rdir, rt = 0, prefix = "", useSVG = FALSE, onWin = FALSE, scaleToSample = FALSE) 
{
  # this function draws scatterplots, histograms and sorted scatters
  # data <dataframe/matrix>: values, one sample per column
  # rdir <directory>: folder for the figure
  # rt <number>: a threshold
  # prefix <string>: a word that will be added to the filename
  # useSVG <bool>: tells whether to draw an SVG or a TIFF
  # onWin <bool>: use this if you are on Windows
  # scaleToSample <bool>: scale the axes per sample instead of globally
  hits.union <- rownames(data)[which(apply(f.comp(data,rt),1,sum) > 0)]
  data <- data[hits.union,]
  # get sorted data and make sure everything is a data frame
  s.data <- apply(data,2,sort)
  data <- as.data.frame(data)
  s.data <- as.data.frame(s.data)
  # some variables
  ns = ncol(data)
  sn = colnames(data)
  # use CairoSVG if you are on windows
  afname <- paste(prefix,'_allinone_',rt,'_',paste(sn,collapse='_'),sep='')
  if (nchar(afname) > 50) { afname <- paste(prefix,'_allinone_',rt,'_many',sep='') }
  if (useSVG) {
    #print("use: rsvg-convert -a -d 300 -p 300 svgfile > pngfile")
    afpath <- file.path(rdir,paste(afname, '.svg', sep = ''))
    if (onWin) { CairoSVG(file = afpath) } 
    else { svg(filename = afpath, onefile = TRUE, bg = FALSE, antialias = "default") }
  } else {
    afpath <- file.path(rdir,paste(afname, '.tiff', sep = ''))
    tiff(file=afpath, width = 600*ns, height = 600*ns, units = "px", pointsize = 20, compression = "lzw", bg = "white", type = "Xlib")
  }
  # plot everything
  layout(mat=matrix(c(1:(ns*ns)), nrow=ns, byrow = TRUE))
  cat(paste("plotting", ns, "rows\n"))
  cat(paste(c("|", rep(' ', ns),"|\n|"),sep='',collapse=''))
  for (i in c(1:ns))
  {
    for (k in c(1:ns))
    {
      if (scaleToSample) {
        xMin <- floor(min(data[,k]))
        xMax <- ceiling(max(data[,k]))
        yMin <- floor(min(data[,i]))
        yMax <- ceiling(max(data[,i]))
      } else {
        xMin <- floor(min(data))
        xMax <- ceiling(max(data))
        yMin <- floor(min(data))
        yMax <- ceiling(max(data))
      }
      if (i == k) 
      {
        # histogram in the diagonal
        f.histogram(data[f.comp(data[,k],rt),k], xName = sn[k], xlim = c(xMin, xMax), main = "")
      }
      else if (i < k)
      { 
        # scatterplot with densities on the top right
        if ((length(unique(data[,k][!is.na(data[,k])])) > 2) & (length(unique(data[,i][!is.na(data[,i])])) > 2)) {
          colorGrad <- f.genetodensitycolor(data[,k], data[,i])
        } else {
          colorGrad <- list(genecols = rep("black", nrow(data)))
        }
        f.scatter(data[,k], data[,i], sn[k], sn[i], cexFactor = 1.5, col = colorGrad$genecols, xlim = c(xMin, xMax), ylim = c(yMin, yMax), main = "")
        abline(0,1,col="black")
      }
      else if (i > k)
      {
        # sorted scatterplot on the bottom left
        f.scatter(s.data[,k], s.data[,i], sn[k], sn[i], cexFactor = 1.5, xlim = c(xMin, xMax), ylim = c(yMin, yMax), main = "")
        abline(0,1,col="darkorchid3")
      }
    }
    cat("#")
  }
  cat('|\n')
  dev.off()
  if (useSVG) { 
    system(paste("rsvg-convert -a -d 300 -p 300 ", afpath," > ", afpath, ".png", sep = '')) 
    #system(paste("rm ", afpath, sep = ''))
  }
}

f.compare.ret.and.fea <- function(trainData, testData, ret, fea) {
  trainData$abs <- abs(trainData[[ret]])
  trainData$sig <- sign(trainData[[ret]])*rnorm(nrow(trainData), .5, .1)
  testData$abs <- abs(testData[[ret]])
  testData$sig <- sign(testData[[ret]])*rnorm(nrow(testData), .5, .1)
  trainData <- trainData[(!is.na(trainData[[ret]])) & (!is.na(trainData[[fea]])),]
  testData <- testData[(!is.na(testData[[ret]])) & (!is.na(testData[[fea]])),]
  layout(matrix(1:6, nrow = 2, byrow = TRUE))
  f.density.scatter(as.numeric(trainData[[fea]]), trainData[[ret]], fea, ret, "return")
  f.density.scatter(as.numeric(trainData[[fea]]), trainData$abs, fea, ret, "abs(return)") 
  f.density.scatter(as.numeric(trainData[[fea]]), trainData$sig, fea, ret, "sign(return)") 
  f.density.scatter(as.numeric(testData[[fea]]), testData[[ret]], fea, ret, "return")
  f.density.scatter(as.numeric(testData[[fea]]), testData$abs, fea, ret, "abs(return)") 
  f.density.scatter(as.numeric(testData[[fea]]), testData$sig, fea, ret, "sign(return)")
  return(NULL)
}

f.compare.ret.and.fea.no.dens <- function(trainData, testData, ret, fea) {
  trainData$abs <- abs(trainData[[ret]])
  trainData$sig <- sign(trainData[[ret]])*rnorm(nrow(trainData), .5, .1)
  testData$abs <- abs(testData[[ret]])
  testData$sig <- sign(testData[[ret]])*rnorm(nrow(testData), .5, .1)
  trainData <- trainData[(!is.na(trainData[[ret]])) & (!is.na(trainData[[fea]])),]
  testData <- testData[(!is.na(testData[[ret]])) & (!is.na(testData[[fea]])),]
  layout(matrix(1:6, nrow = 2, byrow = TRUE))
  f.scatter(as.numeric(trainData[[fea]]), trainData[[ret]], fea, ret, cexFactor = 1, col = "black", main = "return")
  f.scatter(as.numeric(trainData[[fea]]), trainData$abs, fea, ret, cexFactor = 1, col = "black", main = "abs(return)") 
  f.scatter(as.numeric(trainData[[fea]]), trainData$sig, fea, ret, cexFactor = 1, col = "black", main = "sign(return)") 
  f.scatter(as.numeric(testData[[fea]]), testData[[ret]], fea, ret, cexFactor = 1, col = "black", main = "return")
  f.scatter(as.numeric(testData[[fea]]), testData$abs, fea, ret, cexFactor = 1, col = "black", main = "abs(return)") 
  f.scatter(as.numeric(testData[[fea]]), testData$sig, fea, ret, cexFactor = 1, col = "black", main = "sign(return)")
  return(NULL)
}

some functions for imputations and so on

f.get.black.list <- function(train, test, featuresToCheck) {
  cat("checking features for difference between train and test\n")
  checks <- matrix(0, nrow = length(featuresToCheck), ncol = 3, dimnames = list(featuresToCheck, c("est", "pVal", "fdr")))
  for (thing in featuresToCheck) {
    temp <- t.test(as.numeric(train[[thing]]), as.numeric(test[[thing]]))
    checks[thing, "est"] <- temp$statistic
    checks[thing, "pVal"] <- temp$p.value
  }
  checks[, "fdr"] <- p.adjust(checks[, "pVal"], method = "BH") # adjust across all features at once, not per test
  out <- rownames(checks)[checks[, "fdr"] < 0.01]
  print(checks[out,])
  return(out)
}

f.simple.imputation.with.nearest.neighbors <- function(data, nnq = 0.01, impFun = median) {
  require("cluster")
  dsm <- NULL
  nnm <- NULL
  dsmCount <- 0
  nnmCount <- 0
  while (sum(c(is.null(dsm), is.null(nnm))) > 0) { 
    if (is.null(dsm)) {
      dsmCount <- dsmCount + 1
      cat("calculating dissimilarity...", dsmCount, "\n")
      dsm <- as.matrix(daisy(data, "gower"))
    }
    if (is.null(nnm)) {
      nnmCount <- nnmCount + 1
      cat("searching nearest neighbors...", nnmCount, "\n")
      if (nnmCount > 1) { nnq <- nnq + 0.1*nnq }
      nnm <- t(apply(dsm, 1, function(x) which(x < quantile(x, nnq)))) # it includes itself, but actually no problem
    }
  }
  cat("replacing NAs\n")
  for (i in 1:nrow(data)) {
    if ((i %% 1000) == 0) { cat('#') }
    if ((i %% 10000) == 0) { cat('\n') }
    mask <- is.na(data[i,])
    if (sum(mask) == 0) { next }
    if (sum(mask) == 1) {
      data[i, mask] <- impFun(data[nnm[i,],mask], na.rm = TRUE)
    } else {
      data[i, mask] <- apply(data[nnm[i,],mask], 2, function(x) impFun(x, na.rm = TRUE))
    }
  }
  cat('\n')
  return(data)
}

f.simple.imputation.with.nearest.neighbors.sampled <- function(data, nnq = 0.01, impFun = mean, iter = 10, splitInto = 4, numCores = splitInto) {
  require("parallel")
  temp <- array(0, c(nrow(data), ncol(data), iter), list(rownames(data), colnames(data), paste0("iter_", 1:iter)))
  for (i in 1:iter) {
    splitBy <- sample.int(splitInto, nrow(data), replace = TRUE)
    subSets <- split(data, splitBy)
    subSetsImp <- mclapply(subSets, function(x) f.simple.imputation.with.nearest.neighbors(x, nnq, impFun), mc.cores = numCores)
    #subSetsImp <- do.call("rbind", subSetsImp)
    #rownames(subSetsImp) <- sapply(rownames(subSetsImp), function(x) unlist(strsplit(x, '.', TRUE))[2])
    for (ssi in subSetsImp) {
      tt <- as.matrix(ssi)
      temp[rownames(tt), colnames(tt), paste0("iter_", i)] <- tt
    }
  }
  out <- apply(temp, c(1,2), function(x) median(as.numeric(x), na.rm = TRUE))
  return(out)
}

f.simple.imputation.with.nearest.neighbors.sampled.VIM <- function(data, knn = 6, iter = 10, splitInto = 8, numCores = 4) {
  require("parallel")
  require("VIM")
  out <- array(0, c(nrow(data), ncol(data), iter), list(rownames(data), colnames(data), paste0("iter_", 1:iter)))
  for (i in 1:iter) {
    temp <- matrix(0, nrow(data), ncol(data), dimnames = list(rownames(data), colnames(data)))
    splitBy <- sample.int(splitInto, nrow(data), replace = TRUE)
    subSets <- split(data, splitBy)
    subSetsImp <- mclapply(subSets, function(x) kNN(x, k = knn, numFun = mean), mc.cores = numCores)
    for (ssi in subSetsImp) {
      for (cn in colnames(temp)) {
        temp[rownames(ssi), cn] <- ssi[, cn]
      }
    }
    out[,,i] <- temp
  }
  #out <- apply(out, c(1,2), function(x) median(as.numeric(x), na.rm = TRUE))
  return(out)
}

f.checkDupliBool <- function(x) {
  x <- x[!is.na(x)]
  temp <- table(x)
  out <- sum(temp > 1) != 0 # TRUE if at least one value occurs more than once
  return(out)
}

f.checkDupli <- function(x) {
  x <- x[!is.na(x)]
  temp <- table(x)
  dupliCount <- sum(temp>1)
  return(dupliCount)
}

f.checkDupliVal <- function(x) {
  x <- x[!is.na(x)]
  temp <- table(x)
  dupliCount <- sum(temp>1)
  if (dupliCount == 0) { out <- NA}
  else if (dupliCount == 1) { out <- as.numeric(names(temp)[temp > 1])}
  else { out <- paste(names(temp)[temp > 1], collapse = '|') }
  return(out)
}
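
All three are meant to be applied per column, e.g. to spot features that are secretly discrete; a quick toy check:

f.checkDupliBool(c(1, 2, 2, 3, NA))  # TRUE - at least one duplicated value
f.checkDupli(c(1, 2, 2, 3, 3, 3))    # 2 - two distinct values occur more than once
f.checkDupliVal(c(1, 2, 2, 3, 3, 3)) # "2|3" - the duplicated values themselves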

filter, normalization, and model functions

f.filter.handpicked <- function(data) {
  trainMask_D0 <- abs(data$Ret_D0) < 4e-4; trainMask_D0[is.na(trainMask_D0)] <- TRUE; print(sum(!trainMask_D0))
  trainMask_MO <- abs(data$Ret_MinusOne) < .2; trainMask_MO[is.na(trainMask_MO)] <- TRUE; print(sum(!trainMask_MO))
  trainMask_MT <- abs(data$Ret_MinusTwo) < .2; trainMask_MT[is.na(trainMask_MT)] <- TRUE; print(sum(!trainMask_MT))
  trainMask_F3 <- data$Feature_3 > -2; trainMask_F3[is.na(trainMask_F3)] <- TRUE; print(sum(!trainMask_F3))
  trainMask_F4 <- data$Feature_4 > -4; trainMask_F4[is.na(trainMask_F4)] <- TRUE; print(sum(!trainMask_F4))
  trainMask_F17 <- data$Feature_17 < 2; trainMask_F17[is.na(trainMask_F17)] <- TRUE; print(sum(!trainMask_F17))
  trainMask_F18 <- data$Feature_18 > -1.3; trainMask_F18[is.na(trainMask_F18)] <- TRUE; print(sum(!trainMask_F18))
  trainMask_F22 <- data$Feature_22 > -4.5; trainMask_F22[is.na(trainMask_F22)] <- TRUE; print(sum(!trainMask_F22))
  trainMask_F23 <- data$Feature_23 > -2; trainMask_F23[is.na(trainMask_F23)] <- TRUE; print(sum(!trainMask_F23))
  trainMask_F25 <- data$Feature_25 < 1; trainMask_F25[is.na(trainMask_F25)] <- TRUE; print(sum(!trainMask_F25))
  finalMask <- trainMask_D0 & trainMask_MO & trainMask_MT & trainMask_F3 & trainMask_F4 & trainMask_F22 & trainMask_F17 & trainMask_F18 & trainMask_F23 & trainMask_F25
  out <- data[finalMask,]
  return(out)
}

f.normalize.quantiles <- function(train, test, colsToNorm, quant = 1e-3) {
  for (colToNorm in colsToNorm) {
    trainQuants <- sort(quantile(train[[colToNorm]], seq(0, 1, quant), na.rm = TRUE)) # in some rare cases, two quantiles are almost equal but it complains about the sorting
    testQuants <- sort(quantile(test[[colToNorm]], seq(0, 1, quant), na.rm = TRUE))
    trainMap <- findInterval(train[[colToNorm]], trainQuants)
    testMap <- findInterval(test[[colToNorm]], testQuants)
    train[[colToNorm]] <- testQuants[trainMap]
    test[[colToNorm]] <- testQuants[testMap]
  }
  out <- list(train = train, test = test)
  return(out)
}
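
Note the direction of the mapping: both data sets end up on the test quantiles, so the training distribution is forced to look like the test distribution and not the other way around. A minimal sketch with simulated columns (hypothetical data, just to show the call):

trainToy <- data.frame(Feature_X = rnorm(5000, mean = 1)) # shifted relative to the test data
testToy <- data.frame(Feature_X = rnorm(5000, mean = 0))
normed <- f.normalize.quantiles(trainToy, testToy, "Feature_X", quant = 0.01)
round(c(mean(normed$train$Feature_X), mean(normed$test$Feature_X)), 2) # both close to 0 now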

f.stahel.trafo <- function(x) {
  med <- median(x, na.rm = TRUE)
  qua <- quantile(x, 0.25, na.rm = TRUE)
  con <- med/((med/qua)^2.9)
  out <- log(x + con)
  return(out)
}
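
This is a "started log": instead of log(x) it uses log(x + c) with the constant c derived from the median and the lower quartile (Stahel's rule), so values at or near zero do not blow up. A toy check on skewed positive data (made-up numbers, not the actual Feature_7/9/15 distributions):

set.seed(1)
skewed <- rexp(1000, rate = 2)^2 # right-skewed, many values close to zero
f.histogram(skewed, xName = "raw") # long right tail
f.histogram(f.stahel.trafo(skewed), xName = "started log") # far more symmetric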

f.get.additional.variables <- function(data, blackList = c(), noInt = FALSE) {
  require("caTools")
  retCols <- grep("^Ret", colnames(data), value = TRUE)
  feaCols <- grep("^Feature", colnames(data), value = TRUE)
  weiCols <- grep("^Weight", colnames(data), value = TRUE)
  pastRet <- paste0("Ret_", c("MinusTwo", "MinusOne", 2:120))
  futuRet <- paste0("Ret_", c(121:180, "PlusOne", "PlusTwo"))
  originalCols <- colnames(data)
  returnsToSimplify <- paste0("Ret_", 2:120)
  if (length(blackList)) {
    returnsToSimplify <- setdiff(returnsToSimplify, blackList)
    feaCols <- setdiff(feaCols, blackList)
  }
  data$Ret_mean <- apply(data[,returnsToSimplify], 1, function(x) mean(x, na.rm = TRUE))
  data$Ret_median <- apply(data[,returnsToSimplify], 1, function(x) median(x, na.rm = TRUE))
  data$Ret_range <- apply(data[,returnsToSimplify], 1, function(x) max(x, na.rm = TRUE)-min(x, na.rm = TRUE))
  data$Ret_sumAbs <- apply(data[,returnsToSimplify], 1, function(x) sum(abs(x), na.rm = TRUE))
  data$Ret_var <- apply(data[,returnsToSimplify], 1, function(x) var(x, na.rm = TRUE))
  data$Ret_pos <- apply(data[,returnsToSimplify], 1, function(x) sum(x > 0, na.rm = TRUE))
  data$Ret_sum <- apply(data[,returnsToSimplify], 1, function(x) sum(x, na.rm = TRUE))
  data$Ret_sum_scaled <- data$Ret_sum/2*3 # scale the ~120 observed minutes up to a full 180-minute day (factor 3/2)
  data$Ret_past_mean <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) mean(x, na.rm = TRUE))
  data$Ret_past_sumAbs <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) sum(abs(x), na.rm = TRUE))
  data$Ret_past_range <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) max(x, na.rm = TRUE)-min(x, na.rm = TRUE))
  data$Ret_past_var <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) var(x, na.rm = TRUE))
  data$Ret_past_trend <- apply(data[,intersect(paste0("Ret_", 2:120), returnsToSimplify)], 1, function(x) sum(x, na.rm = TRUE))
  data$Ret_past_mean_trend <- apply(data[,intersect(paste0("Ret_", 2:120), returnsToSimplify)], 1, function(x) mean(x, na.rm = TRUE))
  data$Ret_mean_trend_10 <- apply(data[,intersect(paste0("Ret_", 2:10), returnsToSimplify)], 1, function(x) mean(x, na.rm = TRUE))
  data$Ret_mean_trend_15 <- apply(data[,intersect(paste0("Ret_", 6:15), returnsToSimplify)], 1, function(x) mean(x, na.rm = TRUE))
  for (i in seq(20, 120, by = 5)) { 
    toSummarize <- intersect(paste0("Ret_", (i-10+1):i), returnsToSimplify)
    if (length(toSummarize) > 3) {
      data[[paste0("Ret_mean_trend_", i)]] <- apply(data[,], 1, function(x) mean(x, na.rm = TRUE))
    }
  }
#   if ("Ret_121" %in% colnames(data)) {
#     data$Ret_mean_trend <- apply(data[,paste0("Ret_", 121:180)], 1, function(x) mean(x, na.rm = TRUE))
#     data$Ret_mean_trend_130 <- apply(data[,paste0("Ret_", 121:130)], 1, function(x) mean(x, na.rm = TRUE))
#     data$Ret_mean_trend_135 <- apply(data[,paste0("Ret_", 126:135)], 1, function(x) mean(x, na.rm = TRUE))
#     for (i in seq(140, 180, by = 5)) { 
#       data[[paste0("Ret_past_mean_trend_", i)]] <- apply(data[,paste0("Ret_", (i-10+1):i)], 1, function(x) mean(x, na.rm = TRUE))
#     }
#   }
  # 
  simplifiedReturns <- setdiff(colnames(data), originalCols)
  # all feature interactions (pairwise multiplied)
  if (!noInt) {
    doubleInt <- combs(feaCols, 2)
    for (i in 1:nrow(doubleInt)) {
      curF <- doubleInt[i,]
      data[[paste0("int_", paste(curF, collapse = "X"))]] <- data[[curF[1]]]*data[[curF[2]]]
    }
  }
  # or add the triple feature interactions
#   tripleInt <- combs(feaCols, 3)
#   for (i in 1:nrow(tripleInt)) {
#     curF <- tripleInt[i,]
#     data[[paste(curF, collapse = "X")]] <- data[[curF[1]]]*data[[curF[2]]]*data[[curF[3]]]
#   }
  # and add the interactions of the features with all the past returns paste0(fea, "X", pastRet) >> ctrl-a-0 - also not too splendid (there are too many single past return points)
#   for (fea in feaCols) {
#     for (pas in pastRet) {
#       data[[paste0(fea, "X", pas)]] <- data[[fea]]*data[[pas]]
#     }
#   }
  # the same for the simplified returns >> ctrl-a-1 - not so great - how about only this
#   for (fea in feaCols) {
#     for (sim in simplifiedReturns) {
#       data[[paste0(fea, "X", sim)]] <- data[[fea]]*data[[sim]]
#     }
#   }
  
  out <- list(
    data = data, 
    simplifiedReturns = simplifiedReturns
  )
  if (!noInt) {
    out$dointCols = apply(doubleInt, 1, function(x) paste0("int_", paste(x, collapse = 'X')))
    # out$trintCols = apply(tripleInt, 1, function(x) paste(x, collapse = 'X')),
    # out$pasIntCols = paste0(rep(feaCols, each = length(pastRet)), "X", pastRet),
    # out$simIntCols = paste0(rep(feaCols, each = length(simplifiedReturns)), "X", simplifiedReturns)
  }
  return(out)
}

f.prepare.for.models <- function(train, test, type, doPreprocess = TRUE, toFactor = c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20"), addAnyway = c(), blackList = c(), noInt = FALSE, useQuantileNormalization = FALSE) {
  require("caret")
  #########################################################################################
  ### some things for data access
  retCols <- grep("^Ret", colnames(train), value = TRUE)
  feaCols <- grep("^Feature", colnames(train), value = TRUE)
  weiCols <- grep("^Weight", colnames(train), value = TRUE)
  pastRet <- paste0("Ret_", c("MinusTwo", "MinusOne", 2:120))
  futuRet <- paste0("Ret_", c(121:180, "PlusOne", "PlusTwo"))
  #########################################################################################
  ### calculate some additional variables
  toFactor <- setdiff(toFactor, blackList)
  if (type != "intra") {
    cat("calculating additional variables...\n")
    trainDataAndVars <- f.get.additional.variables(train, blackList, noInt)
    testDataAndVars <- f.get.additional.variables(test, blackList, noInt)
    train <- trainDataAndVars$data
    test <- testDataAndVars$data
  } else {
    if (!noInt) {
      trainDataAndVars <- f.get.additional.variables(train, blackList, noInt)
      testDataAndVars <- f.get.additional.variables(test, blackList, noInt)
      train <- trainDataAndVars$data
      test <- testDataAndVars$data
    }
  }
  #########################################################################################
  ### check for correlation with interactions and the regular features as well
  # intCols <- c(pastRet, trainDataAndVars$pasIntCols, trainDataAndVars$dointCols, trainDataAndVars$trintCols, trainDataAndVars$simplifiedReturns, trainDataAndVars$simIntCols)
  cat("check for correlation with target...\n")
  if (type == "inter") {targetVars <- c("Ret_PlusOne", "Ret_PlusTwo")}
  else if (type == "intra") {targetVars <- paste0("Ret_", 121:180)}
  else {targetVars <- type}
  if (type != "intra") { 
    intCols <- unique(c("Ret_MinusTwo", "Ret_MinusOne", trainDataAndVars$simplifiedReturns))
  } else {
    intCols <- c("Ret_MinusTwo", "Ret_MinusOne")
  }
  if (noInt) {
    cutOff <- .25
  } else {
    intCols <- unique(c(intCols, trainDataAndVars$dointCols))
    cutOff <- .75
  }
  intCors <- cor(train[,targetVars], train[, c(intCols, feaCols)], method = "spearman", use = "complete.obs")
  if (length(targetVars) == 1) {
    corSums <- intCors[1,]
  } else {
    corSums <- apply(abs(intCors[targetVars,]), 2, sum)
  }
  corSums <- corSums[setdiff(names(corSums), grep("Ret_[[:digit:]]{1,3}|_trend_", names(corSums), value = TRUE))]
  intVars <- names(corSums)[corSums > quantile(corSums, cutOff)]
  #########################################################################################
  ### convert the factors
  cat("convert factors...\n")
  for (ff in toFactor) {
    train[[ff]] <- as.factor(train[[ff]])
    test[[ff]] <- as.factor(test[[ff]])
  }
  #########################################################################################
  ### check for near zero variance and remove highly correlated features
  cat("check near-zero variance and remove highly correlated predictors...\n")
  nzv <- nearZeroVar(train[,intVars], saveMetrics= TRUE)
  temp <- train[,setdiff(intVars, toFactor)]
  descrCor <- cor(temp, use = "complete.obs")
  highlyCor <- colnames(temp)[findCorrelation(descrCor, cutoff = .75)]
  intVars <- setdiff(intVars, Reduce(union, list(rownames(nzv)[nzv$nzv], highlyCor)))
  intVars <- unique(c(intVars, addAnyway))
  #########################################################################################
  ### preprocess data (needs to be repeated for some vars in the intra day models)
  if (sum(apply(train, 2, function(x) sum(is.na(x)))) == 0) {
    if (doPreprocess) {
      cat("preprocessing data...\n")
#       comb <- rbind(train[, setdiff(intVars, toFactor)], test[, setdiff(intVars, toFactor)])
#       preProComb <- preProcess(comb, method = c("center", "scale", "YeoJohnson"))
#       tempComb <- predict(preProComb, comb)
      preproTrain <- preProcess(train[, setdiff(intVars, toFactor)], method = c("center", "scale", "YeoJohnson"))
      preproTest <- preProcess(test[, setdiff(intVars, toFactor)], method = c("center", "scale", "YeoJohnson"))
      tempTrain <- predict(preproTrain, train[, setdiff(intVars, toFactor)])
      tempTest <- predict(preproTest, test[, setdiff(intVars, toFactor)])
      for (cn in setdiff(intVars, toFactor)) { 
#         train[[cn]] <- tempComb[rownames(train), cn]
#         test[[cn]] <- tempComb[rownames(test), cn]
        train[[cn]] <- tempTrain[[cn]]
        test[[cn]] <- tempTest[[cn]]
      }
    }
  } else {
    cat("skip preprocessing data due to NAs...\n")
  }
  #########################################################################################
  ### preprocess with quantile normalization
  if (useQuantileNormalization) {
    if (doPreprocess) {stop("either use doPreprocess or useQuantileNormalization")}
    contVars <- setdiff(colnames(test), toFactor)
    preproData <- f.normalize.quantiles(train, test, contVars, quant = 0.0001)
    train <- preproData$train
    test <- preproData$test
  }
  #########################################################################################
  ### return
  out <- list(train = train, test = test, vars = intVars)
  return(out)
}

f.inter.day.predictions <- function(train, test, toPredict, predVars, modMet = "rpart", evaMet = "adaptive_cv") {
  #########################################################################################
#   cat("subsetting data...\n")
#   inTrain <- createDataPartition(y = train[[toPredict]], p = .5, list = FALSE)
#   trainSet <- train[inTrain,]
#   testSet <- train[-inTrain,]
  #########################################################################################
  cat("running model with", length(predVars), "variables...\n")
  predVars <- setdiff(predVars, c(toPredict))
  numReps <- ifelse(modMet == "rpart", 50, 10)
  myCtrl <- trainControl(method = evaMet, number = 10, repeats = numReps)
  if (modMet == "ranger") {
    formulaString <- paste0(toPredict, '~', paste(predVars, collapse = '+'))
    myFit <- train(formula(formulaString), train[,c(toPredict, predVars)], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
  } else {
    myFit <- train(x = train[,predVars], y = train[[toPredict]], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
  }
  predValsForScore <- predict(myFit, train)
  baseLine <- f.WMAE(train$Weight_Daily, train[[toPredict]], rep(0, nrow(train)))
  meanLine <- f.WMAE(train$Weight_Daily, train[[toPredict]], mean(train[[toPredict]], na.rm = TRUE))
  diffToNoPred <- f.WMAE(train$Weight_Daily, train[[toPredict]], predValsForScore) - baseLine
  diffToNoPredMean <- f.WMAE(train$Weight_Daily, train[[toPredict]], predValsForScore) - meanLine
  cat("Difference to zero prediction:", diffToNoPred, "... in percent:", diffToNoPred/baseLine*100, "\n")
  cat("Difference to mean prediction:", diffToNoPredMean, "... in percent:", diffToNoPredMean/meanLine*100, "\n")
  predValsForSubmission <- predict(myFit, test)
  #########################################################################################
  out <- list(mod = myFit, pred = predValsForSubmission, score = diffToNoPred/baseLine*100)
  return(out)
}

f.get.additional.variables.intra.test <- function(data, timePointToPredict, memory = 30, win = 10, blackList = c()) {
  validVars <- c(paste0("Ret_", 2:120), paste0("Ret_pred_", 121:180))
  if (length(blackList)) { validVars <- setdiff(validVars, blackList) }
  previous <- (timePointToPredict-memory):(timePointToPredict-1)
  previous <- intersect(c(paste0("Ret_", previous), paste0("Ret_pred_", previous)), validVars)
  # winNums <- 1:(length(previous)-win)
  winNums <- seq(1, length(previous)-win, by = ceiling(win/2))
  cat("calculating indicators...\n")
  for (i in winNums) {
    winToUse <- previous[i:(i+win)]
    data[[paste0("win_mean", i)]] <- apply(data[,winToUse], 1, function(x) mean(x, na.rm = TRUE))
    data[[paste0("win_sumAbs", i)]] <- apply(data[,winToUse], 1, function(x) sum(abs(x), na.rm = TRUE))
    data[[paste0("win_var", i)]] <- apply(data[,winToUse], 1, function(x) sum(abs(x), na.rm = TRUE))
  }
  return(data)
}

f.on.the.fly.intra.day.predictions <- function(train, test, timePointToPredict, predVars, memory = 30, win = 10, modMet = "rpart", evaMet = "adaptive_cv", blackList = c(), useQuantileNormalization = FALSE) {
  #########################################################################################
  ### calculate some more indicators (sliding windows)
  origPredVars <- predVars
  cat("calculating sliding windows...\n")
  train <- f.get.additional.variables.intra.test(train, timePointToPredict, memory, win, blackList)
  test <- f.get.additional.variables.intra.test(test, timePointToPredict, memory, win, blackList)
  winVars <- grep("^win_", colnames(train), value = TRUE)
  sdev <- apply(train[,winVars], 2, function(x) sd(x, na.rm = TRUE))
  winVars <- names(sdev)[sdev > 0]
  #########################################################################################
  ### some preprocessing of the new variables
  cat("preprocessing data...\n")
  if (useQuantileNormalization) {
    preproData <- f.normalize.quantiles(train, test, winVars, quant = 0.0001)
    train <- preproData$train
    test <- preproData$test
  } else {
    preproTrain <- preProcess(train[, winVars], method = c("center", "scale", "YeoJohnson"))
    preproTest <- preProcess(test[, winVars], method = c("center", "scale", "YeoJohnson"))
    train[,winVars] <- predict(preproTrain, train[, winVars])[,winVars]
    test[,winVars] <- predict(preproTest, test[, winVars])[,winVars]
  }
  #########################################################################################
  ### check for correlation with windows and prefilter
  origWinVars <- winVars
  cat(length(winVars), "windows with sd > 0\n")
  cors <- cor(train[,paste0("Ret_", timePointToPredict)], train[, winVars], method = "spearman")[1,]
  winVars <- names(cors)[which(cors > quantile(cors, .75, na.rm = TRUE))] # which() drops possible NAs
  #########################################################################################
  # cat("subsetting data...\n")
  toPredict <- paste0("Ret_", timePointToPredict)
  #inTrain <- createDataPartition(y = train[[toPredict]], p = .5, list = FALSE)
  #trainSet <- train[inTrain,]
  #testSet <- train[-inTrain,]
  #########################################################################################
  predVars <- unique(c(predVars, winVars))
  useAnyway <- paste0(ifelse(timePointToPredict == 121, "Ret_", "Ret_pred_"), timePointToPredict-1) # there is some autocorrelation here
  predVars <- unique(c(predVars, useAnyway))
  missing <- predVars[!(predVars %in% colnames(train))]
  if (length(missing)) {cat(missing, "is not in the data frame\n")}
  cat("running model with", length(predVars), "variables...\n")
  newVar <- paste0("Ret_pred_", timePointToPredict)
  numReps <- ifelse(modMet == "rpart", 50, 10)
  myCtrl <- trainControl(method = evaMet, number = 10, repeats = numReps)
  colsWithNA <- apply(train[,c(toPredict, predVars)], 2, function(x) sum(is.na(x)))
  cat(names(colsWithNA)[colsWithNA > 0], "\n")
  if (modMet == "ranger") {
    formulaString <- paste0(toPredict, '~', paste(predVars, collapse = '+'))
    cat(formulaString, "\n")
    myFit <- train(formula(formulaString), train[,c(toPredict, predVars)], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
  } else {
    myFit <- train(x = train[,predVars], y = train[[toPredict]], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
  }
  predValsForScore <- predict(myFit, train)
  baseLine <- f.WMAE(train$Weight_Intraday, train[[toPredict]], rep(0, nrow(train)))
  diffToNoPred <- f.WMAE(train$Weight_Intraday, train[[toPredict]], predValsForScore) - baseLine
  cat("Difference to zero prediction:", diffToNoPred, "... in percent:", diffToNoPred/baseLine*100, "\n")
  train[[newVar]] <- predict(myFit, train)
  test[[newVar]] <- predict(myFit, test)
  #########################################################################################
  ### NOTE: for security, remove the windows again
  train <- train[, setdiff(colnames(train), origWinVars)]
  test <- test[, setdiff(colnames(test), origWinVars)]
  out <- list(train = train, test = test, vars = origPredVars, mod = myFit, score = diffToNoPred/baseLine*100)
  return(out)
}

f.WMAE <- function(we, y, yHat) { sum(as.numeric(we)*abs(as.numeric(y)-as.numeric(yHat)))/length(we) }
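
f.WMAE is the competition's weighted mean absolute error, used above to compare the models against the zero and mean baselines. A worked toy example (numbers invented):

we <- c(1, 2, 1) # weights
y <- c(0.01, -0.02, 0.005) # true returns
f.WMAE(we, y, rep(0, 3)) # zero baseline: (1*0.01 + 2*0.02 + 1*0.005)/3 = 0.0183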

the actual processing

#########################################################################################
### imputation - libraries
library("VIM")
library("mice")
library("randomForest")
library("cluster")
library("ranger")
library("dbscan")
library("rpart")
library("penalizedSVM")
library("mlr")
library("proxy")
library("party")
library("neuralnet")
library("brnn")
library("qrnn")
library("anfis")
library("frbs")
library("rPython")
library("caret")
library("rpart")
library("doMC")
registerDoMC(cores = 32)

#########################################################################################
### data reading and transformation
rDir <- "/home/ubuntu/Winton"
trainDataWithNA <- read.csv(file.path(rDir, "train.csv"), stringsAsFactors = FALSE, row.names = 1)
testDataWithNA <- read.csv(file.path(rDir, "test_2.csv"), stringsAsFactors = FALSE, row.names = 1)
rownames(trainDataWithNA) <- paste0("tr_", 1:nrow(trainDataWithNA))
rownames(testDataWithNA) <- paste0("te_", 1:nrow(testDataWithNA))
for (fea in paste0("Feature_", c(7,9,15))) {
  trainDataWithNA[[fea]] <- f.stahel.trafo(trainDataWithNA[[fea]])
  testDataWithNA[[fea]] <- f.stahel.trafo(testDataWithNA[[fea]])
}
commonCols <- intersect(colnames(trainDataWithNA), colnames(testDataWithNA))
comb <- rbind(trainDataWithNA[,commonCols], testDataWithNA[,commonCols])

#########################################################################################
### I noticed that the training data does not seem to fit the test data:
# "good models" performed very poorly on the test data, but well on the training data (cross-validation)
# several returns are significantly different in the training data compared to the test data (t-test)
# many approaches based on the past returns and the features (the data present in both train and test)
# failed miserably. In the end it looks like the future returns (which also contain no NAs) separate the
# training data nicely into two large groups - use only the future returns, for which the model fit was always good.
### try dbscan - one could also use the randomForest proximity here?
futureReturns <- paste0("Ret_", c("PlusOne", "PlusTwo", 121:180))
kNNdistplot(trainDataWithNA[,futureReturns])
dbsCl <- dbscan(trainDataWithNA[,futureReturns], .02, length(futureReturns)+1)
table(dbsCl$cluster)
rowsToIgnore <- rownames(trainDataWithNA)[dbsCl$cluster != 1]
write.table(rowsToIgnore, file.path(rDir, "notInCenter_dbscan.txt"), row.names = FALSE, col.names = FALSE, quote = FALSE)

formulaString <- paste0('~', paste(futureReturns, collapse = '+'))
myPCA <- princomp(formula(formulaString), trainDataWithNA[, futureReturns], na.action = na.omit)
plot(myPCA)
trafoData <- as.matrix(trainDataWithNA[, futureReturns])%*%as.matrix(myPCA$loadings[,c("Comp.1", "Comp.2")])
numToCol <- c("black", "firebrick2", "dodgerblue2", "darkorchid2", "darkgoldenrod2")
plot(trafoData[,"Comp.1"], trafoData[,"Comp.2"], pch = 16, col = numToCol[dbsCl$cluster+1])
f.density.scatter(trafoData[,"Comp.1"], trafoData[,"Comp.2"])

#########################################################################################
### have a closer look at the dbscan result - they are really different
# after removing the "notInCenter" rows, Ret_PlusOne and Ret_PlusTwo have very good fits based on Feature_7 alone
# but of course - online, the submissions are really trashy
# what is really awkward is that the correlation between Ret_PlusTwo and Feature_7 is really good in the "notInCenter" rows
toRemove <- scan(file.path(rDir, "notInCenter_dbscan.txt"), what = "character")
validRows <- setdiff(rownames(trainDataWithNA), toRemove)
f.density.scatter(trainDataWithNA[validRows, "Feature_7"], trainDataWithNA[validRows, "Ret_PlusTwo"])
f.density.scatter(trainDataWithNA[toRemove, "Feature_7"], trainDataWithNA[toRemove, "Ret_PlusTwo"])
f.density.scatter(trainDataWithNA[validRows, "Feature_7"], abs(trainDataWithNA[validRows, "Ret_PlusOne"]))
f.density.scatter(trainDataWithNA[validRows, "Feature_7"], abs(trainDataWithNA[validRows, "Ret_PlusTwo"]))

# also interesting, the correlations between the past returns are really different between the two sets
retCols <- grep("^Ret", colnames(trainDataWithNA), value = TRUE) # define the column groups used below
feaCols <- grep("^Feature", colnames(trainDataWithNA), value = TRUE)
pastRet <- paste0("Ret_", c("MinusTwo", "MinusOne", 2:120))
f.generic.correlation.matrix(trainDataWithNA[validRows,feaCols], rDir, "isCenter", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[toRemove,feaCols], rDir, "notCenter", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[validRows,pastRet], rDir, "isCenter_pastRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[toRemove,pastRet], rDir, "notCenter_pastRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[validRows,retCols], rDir, "isCenter_pastAndFutRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[toRemove,retCols], rDir, "notCenter_pastAndFutRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(testDataWithNA[,pastRet], rDir, "test_pastRet", "spearman", corUse = "pairwise.complete.obs")

# this did not change too much though -.- try to solve it with normalization
blackListBefore <- f.get.black.list(trainDataWithNA, testDataWithNA, colnames(testDataWithNA))
blackListAfter <- f.get.black.list(trainDataWithNA[setdiff(rownames(trainDataWithNA), toRemove),], testDataWithNA, colnames(testDataWithNA))
print(length(intersect(blackListBefore, blackListAfter))/length(union(blackListBefore, blackListAfter)))

#########################################################################################
### compare features and some past returns between train and test data - possibly use quantile normalization?
tempTrain <- trainDataWithNA[validRows,]
tempTest <- testDataWithNA
tempTrain$Ret_D0 <- apply(tempTrain[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
tempTest$Ret_D0 <- apply(tempTest[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
for (toCheck in c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Ret_", 2:120), paste0("Feature_", 1:25))) {
  tiff(file.path(rDir, "train_vs_test", paste0("train_vs_test_", toCheck, ".tiff")), type = "Xlib", compression = "lzw", width = 500, height = 500, pointsize = 20)
  testQ <- quantile(tempTest[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
  trainQ <- quantile(tempTrain[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
  f.scatter(trainQ, testQ, "train data", "test data", main = toCheck)
  abline(a = 0, b = 1, col = "red")
  if (toCheck %in% paste0("Ret_", 2:120)) {
    abline(h = 0.008, v=  0.008, col = "green")
    abline(h = -0.008, v = -0.008, col = "green")
  }
  dev.off()
}
# categorical: 1, 5, 9, 10, 13, 20
# normal: 2, 3, 6, 7, 15, 21, 24
# extreme case but fitting: 4
# single outlier: 11, 19, 22
# bent end: 14
# knee/bump: 17, 18, 23, 25
# not ok at all: 8, 16 (factors with few levels)
# D0: abs() < 4e-4
# Ret_MinusOne/Two: abs() < 0.02
tempTrain <- f.filter.handpicked(tempTrain); nrow(tempTrain)
tempTest <- f.filter.handpicked(tempTest); nrow(tempTest)
tempTrain <- tempTrain[apply(tempTrain[,paste0("Ret_", 2:120)], 1, function(x) sum(abs(x) > .006, na.rm = TRUE)) == 0, ]; nrow(tempTrain)
tempTest <- tempTest[apply(tempTest[,paste0("Ret_", 2:120)], 1, function(x) sum(abs(x) > .006, na.rm = TRUE)) == 0, ]; nrow(tempTest)
for (toCheck in c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Ret_", 2:120), paste0("Feature_", 1:25))) {
  tiff(file.path(rDir, "train_vs_test", paste0("filtered_train_vs_test_", toCheck, ".tiff")), type = "Xlib", compression = "lzw", width = 500, height = 500, pointsize = 20)
  testQ <- quantile(tempTest[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
  trainQ <- quantile(tempTrain[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
  f.scatter(trainQ, testQ, "train data", "test data", main = toCheck)
  abline(a = 0, b = 1, col = "red")
  dev.off()
}

validRowsTrain <- rownames(tempTrain)
validRowsTest <- rownames(tempTest)
write.table(validRowsTrain, file.path(rDir, "validRowsTrain.txt"), quote = FALSE, col.names = FALSE, row.names = FALSE) # 33103 / 32341
write.table(validRowsTest, file.path(rDir, "validRowsTest.txt"), quote = FALSE, col.names = FALSE, row.names = FALSE) # 101881 / 98764

# preproTrain <- preProcess(tempTrain[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))], method = c("center", "scale", "YeoJohnson"))
# preproTest <- preProcess(tempTest[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))], method = c("center", "scale", "YeoJohnson"))
# tempTrain <- predict(preproTrain, tempTrain[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))])
# tempTest <- predict(preproTest, tempTest[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))])
contVars <- c("Ret_MinusOne", "Ret_MinusTwo", paste0("Ret_", 2:120), paste0("Feature_", c(2:4,6,7,11,12,14,15,17:19,21:25)))
preproData <- f.normalize.quantiles(tempTrain, tempTest, contVars, quant = 0.01)
tempTrain <- preproData$train
tempTest <- preproData$test
tempTrain$Ret_D0 <- apply(tempTrain[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
tempTest$Ret_D0 <- apply(tempTest[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
for (toCheck in c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Ret_", 2:120), paste0("Feature_", 1:25))) {
  tiff(file.path(rDir, "train_vs_test", paste0("filtered_prePro_train_vs_test_", toCheck, ".tiff")), type = "Xlib", compression = "lzw", width = 500, height = 500, pointsize = 20)
  testQ <- quantile(tempTest[[toCheck]], seq(0, 1, .01), na.rm = TRUE)
  trainQ <- quantile(tempTrain[[toCheck]], seq(0, 1, .01), na.rm = TRUE)
  f.scatter(trainQ, testQ, "train data", "test data", main = toCheck)
  abline(a = 0, b = 1, col = "red")
  dev.off()
}

#########################################################################################
### again an imputation - only with the "validRowsTrain" and "validRowsTest"
# MICE (5 times 10 imputations)
offset <- 0 # pick one offset per run; this block was run five times (offsets 0, 10, 20, 30, 40) for 5 x 10 imputations
# offset <- 10
# offset <- 20
# offset <- 30
# offset <- 40
toRemove <- scan(file.path(rDir, "notInCenter_dbscan.txt"), what = "character")
validRowsTrain <- scan(file.path(rDir, "validRowsTrain.txt"), what = "character")
validRowsTest <- scan(file.path(rDir, "validRowsTest.txt"), what = "character")
invalidRowsTrain <- setdiff(paste0("tr_", 1:4e4), validRowsTrain); length(invalidRowsTrain)
invalidRowsTest <- setdiff(paste0("te_", 1:12e4), validRowsTest); length(invalidRowsTest)
invalidRowsTrain <- setdiff(invalidRowsTrain, toRemove); length(invalidRowsTrain)
validRows <- union(validRowsTrain, validRowsTest); length(validRows)
invalidRows <- union(invalidRowsTrain, invalidRowsTest); length(invalidRows)
for (toFactor in c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20")) { comb[[toFactor]] <- as.factor(comb[[toFactor]]) }
combValid <- comb[validRows,]; nrow(combValid)
combImpAllValid <- mice(combValid, m = 10, seed = 123 + offset) # vary the seed per run - a fixed seed would make all five runs identical
for (i in 1:10) {
  temp <- complete(combImpAllValid, i)
  write.csv(temp, file.path(rDir, "mice_final", paste0("VALID_all_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
  write.csv(temp[validRowsTest,], file.path(rDir, "mice_final", paste0("VALID_test_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
  trainDataWithNA[validRowsTrain, colnames(temp)] <- temp[validRowsTrain,]
  write.csv(trainDataWithNA[validRowsTrain,], file.path(rDir, "mice_final", paste0("VALID_train_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
}
combInvalid <- comb[invalidRows,]; nrow(combInvalid)
combImpAllInvalid <- mice(combInvalid, m = 10, seed = 123 + offset) # vary the seed here as well
for (i in 1:10) {
  temp <- complete(combImpAllInvalid, i)
  write.csv(temp, file.path(rDir, "mice_final", paste0("INVALID_all_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
  write.csv(temp[invalidRowsTest,], file.path(rDir, "mice_final", paste0("INVALID_test_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
  trainDataWithNA[invalidRowsTrain,colnames(temp)] <- temp[invalidRowsTrain,]
  write.csv(trainDataWithNA[invalidRowsTrain,], file.path(rDir, "mice_final", paste0("INVALID_train_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
}

# get also an average imputation from all the mice runs - VALID and INVALID data
library("VIM") # for maxCat
toFactor <- c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20")
for (miceType in c("INVALID")) {
  trainData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_train_mice_imp_all_", 1,".csv")), stringsAsFactors = FALSE, row.names = 1)
  testData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_test_mice_imp_all_", 1,".csv")), stringsAsFactors = FALSE, row.names = 1)
  for (cn in toFactor) { 
    trainData[[cn]] <- as.factor(trainData[[cn]])
    testData[[cn]] <- as.factor(testData[[cn]])
  }
  tempTrain <- array(0, c(nrow(trainData), ncol(trainData), 50), list(rownames(trainData), colnames(trainData), paste0("iter_", 1:50)))
  tempTest <- array(0, c(nrow(testData), ncol(testData), 50), list(rownames(testData), colnames(testData), paste0("iter_", 1:50)))
  for (cn in colnames(trainData)) { tempTrain[rownames(trainData), cn, "iter_1"] <- trainData[[cn]] }
  for (cn in colnames(testData)) { tempTest[rownames(testData), cn, "iter_1"] <- testData[[cn]] }
  for (impRun in 2:50) {
    cat("processing imputation", impRun, "...\n")
    testData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_test_mice_imp_all_", impRun,".csv")), stringsAsFactors = FALSE, row.names = 1)
    trainData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_train_mice_imp_all_", impRun,".csv")), stringsAsFactors = FALSE, row.names = 1)
    for (cn in toFactor) { 
      trainData[[cn]] <- as.factor(trainData[[cn]])
      testData[[cn]] <- as.factor(testData[[cn]])
    }
    for (cn in colnames(trainData)) { tempTrain[rownames(trainData), cn, paste0("iter_", impRun)] <- trainData[[cn]] }
    for (cn in colnames(testData)) { tempTest[rownames(testData), cn, paste0("iter_", impRun)] <- testData[[cn]] }
  }
  quantCols <- setdiff(colnames(testData),toFactor)
  trainData[,toFactor] <- apply(tempTrain[,toFactor,], c(1,2), function(x) maxCat(x))
  trainData[,quantCols] <- apply(tempTrain[,quantCols,], c(1,2), function(x) median(as.numeric(x)))
  testData[,toFactor] <- apply(tempTest[,toFactor,], c(1,2), function(x) maxCat(x))
  testData[,quantCols] <- apply(tempTest[,quantCols,], c(1,2), function(x) median(as.numeric(x)))
  write.csv(testData, file.path(rDir, paste0(miceType, "_test_mice_imp_all_mean.csv")), quote = FALSE)
  write.csv(trainData, file.path(rDir, paste0(miceType, "_train_mice_imp_all_mean.csv")), quote = FALSE)
}

#########################################################################################
### again an imputation - only with the "validRowsTrain" and "validRowsTest"
# kNN with VIM (should solve the categorical problem)
toRemove <- scan(file.path(rDir, "notInCenter_dbscan.txt"), what = "character")
validRowsTrain <- scan(file.path(rDir, "validRowsTrain.txt"), what = "character")
validRowsTest <- scan(file.path(rDir, "validRowsTest.txt"), what = "character")
invalidRowsTrain <- setdiff(paste0("tr_", 1:4e4), validRowsTrain); length(invalidRowsTrain)
invalidRowsTest <- setdiff(paste0("te_", 1:12e4), validRowsTest); length(invalidRowsTest)
invalidRowsTrain <- setdiff(invalidRowsTrain, toRemove); length(invalidRowsTrain)
validRows <- union(validRowsTrain, validRowsTest); length(validRows)
invalidRows <- union(invalidRowsTrain, invalidRowsTest); length(invalidRows)
toFactor <- c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20")
for (cn in toFactor) { comb[[cn]] <- as.factor(comb[[cn]]) }
combValid <- comb[validRows,]; nrow(combValid)
combInvalid <- comb[invalidRows,]; nrow(combInvalid)
knnValidIter <- f.simple.imputation.with.nearest.neighbors.sampled.VIM(combValid, splitInto = 8, numCores = 8)
knnValid <- combValid
knnValid[,toFactor] <- apply(knnValidIter[,toFactor,], c(1,2), function(x) maxCat(x))
knnValid[,setdiff(colnames(knnValid),toFactor)] <- apply(knnValidIter[,setdiff(colnames(knnValid),toFactor),], c(1,2), function(x) median(x))
knnInvalidIter <- f.simple.imputation.with.nearest.neighbors.sampled.VIM(combInvalid, splitInto = 8, numCores = 8)
knnInvalid <- combInvalid
knnInvalid[,toFactor] <- apply(knnInvalidIter[,toFactor,], c(1,2), function(x) maxCat(x))
knnInvalid[,setdiff(colnames(knnInvalid),toFactor)] <- apply(knnInvalidIter[,setdiff(colnames(knnInvalid),toFactor),], c(1,2), function(x) median(x))
for (cn in colnames(knnInvalid)) {
  cat("processing", cn, "...\n")
  trainDataWithNA[validRowsTrain, cn] <- knnValid[validRowsTrain, cn]
  testDataWithNA[validRowsTest, cn] <- knnValid[validRowsTest, cn]
  trainDataWithNA[invalidRowsTrain, cn] <- knnInvalid[invalidRowsTrain, cn]
  testDataWithNA[invalidRowsTest, cn] <- knnInvalid[invalidRowsTest, cn]
}
write.csv(trainDataWithNA[setdiff(rownames(trainDataWithNA), toRemove), ], file.path(rDir, "BOTH_kNN_train.csv"), quote = FALSE)
write.csv(testDataWithNA, file.path(rDir, "BOTH_kNN_test.csv"), quote = FALSE)
write.csv(trainDataWithNA[validRowsTrain, ], file.path(rDir, "VALID_kNN_train.csv"), quote = FALSE)
write.csv(testDataWithNA[validRowsTest, ], file.path(rDir, "VALID_kNN_test.csv"), quote = FALSE)
write.csv(trainDataWithNA[invalidRowsTrain, ], file.path(rDir, "INVALID_kNN_train.csv"), quote = FALSE)
write.csv(testDataWithNA[invalidRowsTest, ], file.path(rDir, "INVALID_kNN_test.csv"), quote = FALSE)

#########################################################################################
### load data (check if there are still some NAs left - no)
rDir <- "/Users/marc/Documents/winton"
rDir <- "/home/ubuntu/Winton"
impType <- "INVALID"
trainDataMICE <- read.csv(file.path(rDir, paste0(impType, "_train_mice_imp_all_mean.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(trainDataMICE)
testDataMICE <- read.csv(file.path(rDir, paste0(impType, "_test_mice_imp_all_mean.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(testDataMICE)
trainDataKNN <- read.csv(file.path(rDir, paste0(impType, "_kNN_train.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(trainDataKNN)
testDataKNN <- read.csv(file.path(rDir, paste0(impType, "_kNN_test.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(testDataKNN)
sum(!complete.cases(trainDataMICE)) # rows with at least one NA - expect 0 everywhere
sum(!complete.cases(testDataMICE))
sum(!complete.cases(trainDataKNN))
sum(!complete.cases(testDataKNN))
toFactor <- c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20") # needed again below

#########################################################################################
### check scatterplots by eye
contVars <- setdiff(colnames(testDataMICE), toFactor)
preproDataMICE <- f.normalize.quantiles(trainDataMICE, testDataMICE, contVars, quant = 0.0001)
preproDataKNN <- f.normalize.quantiles(trainDataKNN, testDataKNN, contVars, quant = 0.0001)
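
# f.normalize.quantiles() is defined earlier; with quant = 1e-4 it presumably
# winsorizes each continuous variable at its extreme quantiles (limits taken
# from the training data) and rescales both sets to [0, 1]. A hypothetical
# sketch of that assumption:
f.normalize.quantiles.sketch <- function(train, test, contVars, quant = 1e-4) {
  for (cn in contVars) {
    lims <- quantile(train[[cn]], c(quant, 1 - quant), na.rm = TRUE)
    if (diff(lims) > 0) {
      train[[cn]] <- (pmin(pmax(train[[cn]], lims[1]), lims[2]) - lims[1]) / diff(lims)
      test[[cn]]  <- (pmin(pmax(test[[cn]],  lims[1]), lims[2]) - lims[1]) / diff(lims)
    }
  }
  list(train = train, test = test)
}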
for (ret in c("Ret_PlusOne", "Ret_PlusTwo")) {
  for (fea in paste0("Feature_", 10:25)) {
    tiff(file.path(rDir, "ret_vs_fea", paste0("validRows_raw_mice_knn_", ret, "_vs_", fea, ".tiff")), type = "Xlib", compression = "lzw", width = 1500, height = 1000, pointsize = 20)
    if (fea %in% paste0("Feature_", c(10, 16))) {
      f.compare.ret.and.fea.no.dens(trainDataMICE, trainDataKNN, ret, fea)
    } else {
      f.compare.ret.and.fea(trainDataMICE, trainDataKNN, ret, fea)
    }
    dev.off()
    tiff(file.path(rDir, "ret_vs_fea", paste0("validRows_quantNorm_mice_knn_", ret, "_vs_", fea, ".tiff")), type = "Xlib", compression = "lzw", width = 1500, height = 1000, pointsize = 20)
    if (fea %in% paste0("Feature_", c(10, 16))) {
      f.compare.ret.and.fea.no.dens(preproDataMICE$train, preproDataKNN$train, ret, fea)
    } else {
      f.compare.ret.and.fea(preproDataMICE$train, preproDataKNN$train, ret, fea)
    }
    dev.off()
  }
}
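
# f.compare.ret.and.fea() and its .no.dens variant are defined earlier; they
# presumably put the MICE and kNN imputations side by side as return-vs-feature
# scatterplots (the .no.dens variant likely uses plain plot() for the
# few-level Features 10 and 16, where a density scatter is pointless).
# A hypothetical sketch of the density version:
f.compare.ret.and.fea.sketch <- function(mice, knn, ret, fea) {
  layout(matrix(1:2, nrow = 1))
  smoothScatter(mice[[fea]], mice[[ret]], xlab = fea, ylab = ret, main = "MICE")
  smoothScatter(knn[[fea]],  knn[[ret]],  xlab = fea, ylab = ret, main = "kNN")
}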

#########################################################################################
### predictions - in total we have now:
# data sets: VALID and INVALID
# imputations: MICE (take mean) and kNN
# normalization: preProcess() and f.normalize.quantiles()
# methods: cubist, ranger, rpart
# this will all run on different VMs...
jobs <- list()
jobs$MICE_preProcess_cubist <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "cubist", evalMet = "boot")
jobs$MICE_preProcess_randomForest <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "rf", evalMet = "boot")
jobs$MICE_preProcess_rpart <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "rpart", evalMet = "repeatedcv")
jobs$MICE_quantNorm_cubist <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "cubist", evalMet = "boot")
jobs$MICE_quantNorm_randomForest <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "rf", evalMet = "boot")
jobs$MICE_quantNorm_rpart <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "rpart", evalMet = "repeatedcv")
jobs$KNN_preProcess_cubist <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "cubist", evalMet = "boot")
jobs$KNN_preProcess_randomForest <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "rf", evalMet = "boot")
jobs$KNN_preProcess_rpart <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "rpart", evalMet = "repeatedcv")
jobs$KNN_quantNorm_cubist <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "cubist", evalMet = "boot")
jobs$KNN_quantNorm_randomForest <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "rf", evalMet = "boot")
jobs$KNN_quantNorm_rpart <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "rpart", evalMet = "repeatedcv")
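
# purely as a usage note: the twelve jobs above are the full 2 (imputation) x
# 2 (normalization) x 3 (model) grid, so the same list could be generated
# programmatically (same name/config pairs, different order; stored under
# jobsAlt so it does not clobber the explicit list, which is easier to eyeball):
jobGrid <- expand.grid(imp = c("MICE", "KNN"), norm = c("preProcess", "quantNorm"),
                       mod = c("cubist", "rf", "rpart"), stringsAsFactors = FALSE)
jobsAlt <- lapply(seq_len(nrow(jobGrid)), function(i) {
  isMICE <- jobGrid$imp[i] == "MICE"
  list(train = if (isMICE) trainDataMICE else trainDataKNN,
       test  = if (isMICE) testDataMICE  else testDataKNN,
       doPreprocess = jobGrid$norm[i] == "preProcess",
       useQuantileNormalization = jobGrid$norm[i] == "quantNorm",
       modMet = jobGrid$mod[i],
       evalMet = ifelse(jobGrid$mod[i] == "rpart", "repeatedcv", "boot"))
})
names(jobsAlt) <- paste(jobGrid$imp, jobGrid$norm,
                        ifelse(jobGrid$mod == "rf", "randomForest", jobGrid$mod), sep = "_")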

nrow(trainDataMICE)
nrow(testDataMICE)
nrow(trainDataKNN)
nrow(testDataKNN)

#########################################################################################
### predictions - run exactly one of the jobList assignments below per VM
### (each assignment overwrites the previous one)
jobList <- c("MICE_preProcess_cubist")
jobList <- c("MICE_preProcess_randomForest")
jobList <- c("MICE_quantNorm_cubist")
jobList <- c("MICE_quantNorm_randomForest")
jobList <- c("KNN_preProcess_cubist")
jobList <- c("KNN_preProcess_randomForest")
jobList <- c("KNN_quantNorm_cubist")
jobList <- c("KNN_quantNorm_randomForest")
jobList <- grep("^MICE_preProcess", names(jobs), value = TRUE)[1:2]
jobList <- grep("^MICE_quantNorm", names(jobs), value = TRUE)[1:2]
jobList <- grep("^KNN_preProcess_", names(jobs), value = TRUE)[1:2]
jobList <- grep("^KNN_quantNorm_", names(jobs), value = TRUE)[1:2]
jobList <- grep("rpart$", names(jobs), value = TRUE)

for (currentJob in jobList) {
  trainData <- jobs[[currentJob]]$train
  testData <- jobs[[currentJob]]$test
  doPrep <- jobs[[currentJob]]$doPreprocess
  useQua <- jobs[[currentJob]]$useQuantileNormalization
  modMet <- jobs[[currentJob]]$modMet
  evaMet <- jobs[[currentJob]]$evalMet
  toFact <- paste0("Feature_", c(1,5,10,13,16,20))
  cat("doing interday predictions...\n")    
  preparedData <- f.prepare.for.models(trainData, testData, "inter", doPreprocess = doPrep, toFactor = toFact, noInt = TRUE, useQuantileNormalization = useQua)
  scoresInter <- c()
  predictions <- list()
  for (toPredict in c("Ret_PlusOne", "Ret_PlusTwo")) {
    cat("doing", currentJob, toPredict, "...\n")
    tempRes <- f.inter.day.predictions(preparedData$train, preparedData$test, toPredict, preparedData$vars, modMet, evaMet)
    predictions[[toPredict]] <- tempRes$pred
    scoresInter <- c(scoresInter, tempRes$score)
  }
  out <- do.call("cbind", predictions); colnames(out) <- c("Ret_PlusOne", "Ret_PlusTwo"); rownames(out) <- rownames(testData)
  write.table(scoresInter, file.path(rDir, paste0("scores_interday_", impType, "_", currentJob, ".txt")), quote = FALSE, row.names = FALSE, col.names = FALSE)
  write.csv(out, file.path(rDir, paste0("interday_", impType, "_", currentJob, ".csv")), quote = FALSE)
  tiff(file.path(rDir, paste0("predCheck_", impType, "_", currentJob, ".tiff")), width = 600, height = 300)
  layout(matrix(1:2, nrow = 1))
  f.histogram(predictions$Ret_PlusOne)
  f.histogram(predictions$Ret_PlusTwo)
  dev.off()
  cat("doing intraday predictions...\n")    
  preparedData <- f.prepare.for.models(trainData, testData, "intra", doPreprocess = doPrep, toFactor = toFact, noInt = TRUE, useQuantileNormalization = useQua)
  scoresIntra <- c()
  for (timePointToPredict in 121:180) {
    cat("doing", currentJob, paste0("Ret_pred_", timePointToPredict), "...\n")
    preparedData <- f.on.the.fly.intra.day.predictions(preparedData$train, preparedData$test, timePointToPredict, preparedData$vars, memory = 30, win = 6, 
                                                       modMet = modMet, evaMet = evaMet, useQuantileNormalization = useQua)
    scoresIntra <- c(scoresIntra, preparedData$score)
    write.table(preparedData$test[, paste0("Ret_pred_", timePointToPredict)], file.path(rDir, paste0("Ret_pred_", timePointToPredict, "_", impType, "_", currentJob, ".txt")), quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
  write.table(scoresIntra, file.path(rDir, paste0("scores_intraday_", impType, "_", currentJob, ".txt")), quote = FALSE, row.names = FALSE, col.names = FALSE) # filename must match the loader below (no time point in the name)
  write.csv(preparedData$test[, grep("^Ret_pred", colnames(preparedData$test), value = TRUE)], file.path(rDir, paste0("intraday_", impType, "_", currentJob, ".csv")), quote = FALSE)
}
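
# f.prepare.for.models(), f.inter.day.predictions() and
# f.on.the.fly.intra.day.predictions() are defined earlier; given the
# modMet/evalMet pair they presumably wrap caret::train(). The sign convention
# used when merging below (negative score = keep) suggests the score compares
# the model's CV error against a predict-zero baseline; the intraday variant
# presumably does the same per time point, using the last `memory` minutes
# (smoothed over windows of `win`) as predictors. A hypothetical sketch of the
# per-target inter-day fit under those assumptions:
f.inter.day.sketch <- function(train, test, toPredict, vars, modMet, evaMet) {
  require("caret")
  ctrl <- trainControl(method = evaMet, number = 5)
  fit <- train(x = train[, vars], y = train[[toPredict]],
               method = modMet, trControl = ctrl)
  # score relative to the "predict zero return" baseline: negative = model wins
  baselineRMSE <- sqrt(mean(train[[toPredict]]^2))
  list(pred = predict(fit, newdata = test[, vars]),
       score = min(fit$results$RMSE, na.rm = TRUE) - baselineRMSE)
}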

#########################################################################################
### load the predictions and merge them
impType <- "VALID"
jobList <- c("KNN_quantNorm_cubist")
predictions <- list()
for (dataSet in jobList) {
  predictions[[dataSet]] <- matrix(0, nrow = 12e4, ncol = 62, dimnames = list(paste0("te_", 1:12e4), c("Ret_PlusOne", "Ret_PlusTwo", paste0("Ret_pred_", 121:180))))
  for (impType in c("VALID", "INVALID")) {
    for (prefix in c("interday", "intraday")) {
      temp <- read.csv(file.path(rDir, "final_predictions", paste0(prefix, "_", impType, "_", dataSet, ".csv")), row.names = 1)
      for (cn in colnames(temp)) {
        predictions[[dataSet]][rownames(temp), cn] <- temp[,cn]
      }
      scoreVec <- scan(file.path(rDir, "final_predictions", paste0("scores_", prefix, "_", impType, "_", dataSet, ".txt")), what = double())
      if (prefix == "intraday") {
        # scoreVec[i] belongs to Ret_pred_(120 + i), so address the matrix by
        # column name - a plain which() would hit the two interday columns
        cat("discarding", sum(scoreVec > 0), "bad fits\n")
        predictions[[dataSet]][rownames(temp), paste0("Ret_pred_", 120 + which(scoreVec > 0))] <- 0
        cat("discarding", sum(scoreVec < (-.05)), "suspiciously good fits\n")
        predictions[[dataSet]][rownames(temp), paste0("Ret_pred_", 120 + which(scoreVec < (-.05)))] <- 0
      }
    }
  }
}

for (dataSet in jobList) {
  out <- data.frame(
    Id = paste0(rep(1:12e4, each = 62), '_', 1:62),
    Predicted = 0,
    stringsAsFactors = FALSE
  )
  rownames(out) <- out$Id
  for (id in 1:60) {
    ret <- paste0("Ret_pred_", 120+id)
    out[paste0(1:12e4, '_', id), "Predicted"] <- predictions[[dataSet]][,ret]
  }
  out[paste0(1:12e4, '_61'),"Predicted"] <- predictions[[dataSet]][,"Ret_PlusOne"]
  out[paste0(1:12e4, '_62'),"Predicted"] <- predictions[[dataSet]][,"Ret_PlusTwo"]
  write.csv(out, file.path(rDir, "final_predictions", paste0("FIN_", dataSet, "_onlyIntra.csv")), quote = FALSE, row.names = FALSE)
}
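
# added sanity check (not part of the original pipeline): quick shape and
# Id-order test on the assembled submission before upload - the 62 targets per
# window are the 60 intraday returns plus D+1 and D+2
stopifnot(nrow(out) == 12e4 * 62)                          # 120k windows x 62 targets
stopifnot(identical(out$Id[1:3], c("1_1", "1_2", "1_3")))  # Id runs within-window first
stopifnot(!anyNA(out$Predicted))                           # no missing predictions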