Description from Kaggle

In this competition the challenge is to predict the return of a stock, given the history of the past few days.

We provide 5-day windows of time, days D-2, D-1, D, D+1, and D+2. You are given returns in days D-2, D-1, and part of day D, and you are asked to predict the returns in the rest of day D, and in days D+1 and D+2.

During day D, there is intraday return data, which are the returns at different points in the day. We provide 180 minutes of data, from t=1 to t=180. In the training set you are given the full 180 minutes; in the test set, only the first 120 minutes are provided.

For each 5-day window, we also provide 25 features, Feature_1 to Feature_25. These may or may not be useful in your prediction.

Each row in the dataset is an arbitrary stock in an arbitrary 5-day time window.
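
For orientation, this is the column layout the code below relies on (summarized from how the columns are used throughout; check the actual headers after loading the CSVs):

# Id                            row identifier (used as row name when reading)
# Feature_1 ... Feature_25      anonymized features per window (a few of them categorical)
# Ret_MinusTwo, Ret_MinusOne    daily returns of days D-2 and D-1
# Ret_2 ... Ret_120             intraday returns of day D, present in train and test
# Ret_121 ... Ret_180           intraday returns of day D, train only (to predict)
# Ret_PlusOne, Ret_PlusTwo      daily returns of days D+1 and D+2, train only (to predict)
# Weight_Intraday, Weight_Daily weights used by the evaluation metric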

Outline

And apropos originality… the code below is already quite a clean-up :D

Below are the correlations between the past returns in the test data (figure: test_past_returns), in the “outlier” training data (figure: outlier_past_returns), and finally in the cleaned-up training data (figure: clean_past_returns).

some plotting functions

################################################################################################################
### some functions (sorry - I had some of them at hand already)
f.yellowredblue <- function(x) {
  r <- approx(c(0, 0.5, 1), c(1, 1, 0), n = x)$y
  g <- approx(c(0, 0.5, 1), c(1, 0, 0), n = x)$y
  b <- approx(c(0, 0.5, 1), c(0, 0, 1), n = x)$y
  return(rgb(r, g, b))
}
f.blueredyellow <- function(x) {
  return(rev(f.yellowredblue(x)))
}
f.comp <- function(x, rt) {
  # compare against a threshold rt: strictly positive if rt == 0, else >= rt
  if (rt == 0) { y <- x > 0 }
  else { y <- x >= rt }
  return(y)
}

f.generic.correlation.matrix <- function(data, rDir, outfile, corMethod = "pearson", useOnlyHighVar = FALSE, tryAutoColor = TRUE, highVarPercentage = 0.9, corUse = "everything") {
  require("gplots")
  if (useOnlyHighVar) {
    varVec <- apply(data, 1, var)
    lowerBound <- quantile(varVec, highVarPercentage)
    data <- data[varVec>=lowerBound,]
  }
  sampleCorMat <- cor(data, method = corMethod, use = corUse)
  # some plotting parameters
  minCor <- min(sampleCorMat, na.rm = TRUE)
  maxCor <- max(sampleCorMat[sampleCorMat != 1], na.rm = TRUE)
  if (sum(is.na(c(minCor, maxCor))) > 0) { tryAutoColor <- FALSE }
  if (tryAutoColor) {
    # snap the color range outward to round bounds
    if (minCor > 0.8) { minCor <- 0.8 } else if (minCor > 0.7) { minCor <- 0.7 } else if (minCor > 0.5) { minCor <- 0.5 }
    if (maxCor > 0.8) { maxCor <- 1 } else if (maxCor > 0.7) { maxCor <- 0.8 } else if (maxCor > 0.5) { maxCor <- 0.7 } else { maxCor <- 0.5 }
  }
  numCols <- ceiling((maxCor-minCor)*50)
  tiff(file.path(rDir, paste(outfile, "_cor_mat.tiff", sep = '')), width = 1600, height = 1600, compression = "lzw", type = "Xlib")
  labelCex <- 1 + 1/log10(nrow(sampleCorMat))
  noteCex <- 4
  if (nrow(sampleCorMat) > 6) { noteCex <- 3 }
  if (nrow(sampleCorMat) > 12) { noteCex <- 2 }
  if (nrow(sampleCorMat) > 18) { noteCex <- 1 }
  if (nrow(sampleCorMat) > 30) { #no notes
    heatmap.2(sampleCorMat, col = f.blueredyellow(numCols), trace="none", scale = "none", margins = c(15,15), breaks = seq(minCor, maxCor, length.out = numCols+1))
  } else {
    heatmap.2(sampleCorMat, col = f.blueredyellow(numCols), trace="none", scale = "none", margins = c(15,15), cellnote = round(sampleCorMat,2), notecol = "black", notecex = noteCex, breaks = seq(minCor, maxCor, length.out = numCols+1), cexRow = labelCex, cexCol = labelCex)
  }
  dev.off()
}
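
A minimal call for reference (toy matrix and tempdir() are made up for illustration; the tiff type "Xlib" assumes an X11-capable Linux box, as in the processing section below):

set.seed(42)
base <- rnorm(200)
toy <- sapply(1:5, function(i) base + rnorm(200, sd = i/2)) # five noisy copies of one signal
colnames(toy) <- paste0("S", 1:5)
f.generic.correlation.matrix(toy, tempdir(), "toy_example")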

f.histogram <- function(x, y = c(), xName = "", useFreq = FALSE, doNotShowSummary = FALSE, useLog = FALSE, ...) 
{
  # this is a normal histogram mode (not bars, but frequency or density)
  # x: values
  # xName: x-axis label
  # useFreq: will plot frequencies instead of densities
  if (length(x) == length(y)) {
    toPlot <- list(x = x, y = y)
    class(toPlot) <- "density"
  } else {
    toPlot <- density(x)
  }
  if (useFreq) { 
    toPlot$y <- toPlot$y * length(x)
  }
  if (useLog) {
    if (useFreq) { toPlot$y <- log2(toPlot$y+1) }
    else { cat("log only possible for useFreq = TRUE\n") }
  }
  par(mar = c(5,4,1,1))
  plot(
    toPlot,
    #main = "",
    bty = "n",
    xaxs = "r",
    yaxs = "r",
    xlab = xName,
    ylab = "",
    las = 1,
    cex = 0.4,
    tck = 0.01,
    ...
  )
  if (!doNotShowSummary) {
    text(
      par("usr")[1] + par("usr")[2]/15,
      par("usr")[4] - par("usr")[4]/15,
      adj = c(0,1),
      labels = paste(c("median:", "mean:", "sd:"),
                     c(round(median(x), digits = 3),
                       round(mean(x), digits = 3),
                       round(sd(x), digits = 3)),
                     collapse = "\n", sep=" ")
    )
  }
}

f.scatter <- function(x, y, xName = "", yName = "", cexFactor = 1, ...)
{
  # this will draw a normal scatterplot
  # x: x values
  # y: y values
  # xName: x-axis label
  # yName: y-axis label
  par(mar = c(5,5,1,1))
  plot(
    y ~ x,
    #main = "",
    bty = "n",
    xaxs = "r",
    yaxs = "r",
    xlab = xName,
    ylab = yName,
    las = 1,
    cex = 0.2*cexFactor,
    tck = 0.01,
    pch = 19,
    ...
  )
  text(
    par("usr")[1] + par("usr")[2]/15,
    par("usr")[4] - par("usr")[4]/15,
    adj = c(0,1),
    labels = paste(c("corP:", "corS:", "n:"),
                   c(round(cor(x,y,method="pearson"), digits = 3),
                     round(cor(x,y,method="spearman"), digits = 3),
                     length(x)),
                   collapse = "\n", sep=" ")
  )
}

f.genetodensity <- function(x,y) {
  require("MASS")
  est <- kde2d(x, y, n = 50) # needs MASS - calculates the two dimensional density
  if (sum(is.na(est$z)) > 0) {
    est <- kde2d(x, y, n = 50, h = c(bw.nrd0(x), bw.nrd0(y))) # needs MASS - calculates the two dimensional density
  }
  dvec <- as.vector(est$z) # note: as.vector takes columnwise, rows correspond to the value of x, columns to the value of y - the y index counts therefore 50 times
  xind <- findInterval(x, est$x) # find the intervals to which a gene belongs
  yind <- findInterval(y, est$y) # find the intervals to which a gene belongs
  vecind <- (yind-1)*50 + xind # this is a vector with indices for dvec. 
  dens <- dvec[vecind]
  names(dens) <- names(x)
  return(dens)
}
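
The index arithmetic hinges on R storing matrices column-major: entry [xind, yind] of est$z sits at position (yind-1)*nrow + xind of as.vector(est$z). A one-line sanity check on a toy matrix:

z <- matrix(1:12, nrow = 3)
z[2, 4] == as.vector(z)[(4-1)*3 + 2] # TRUE - same element via the vector index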

f.genetodensitycolor <- function(x,y) {
  require("MASS")
  require("colorRamps")
  ## NOTE: would this break if two grid points had exactly the same density?
  ## for a nice picture it is important to remove the grid points without genes (the ones with the lowest density)
  grids <- 100
  out <- list()
  est <- kde2d(x, y, n = grids) # needs MASS - calculates the two dimensional density
  if (sum(is.na(est$z)) > 0) {
    est <- kde2d(x, y, n = grids, h = c(bw.nrd0(x), bw.nrd0(y))) # needs MASS - calculates the two dimensional density
  }
  dvec <- as.vector(est$z) # note: as.vector takes columnwise, rows correspond to the value of x, columns to the value of y - the y index counts therefore grids times
  xind <- findInterval(x, est$x) # find the intervals to which a gene belongs
  yind <- findInterval(y, est$y) # find the intervals to which a gene belongs
  vecind <- (yind-1)*grids + xind # this is a vector with indices for dvec. 
  dens <- dvec[vecind]
  dens_with_genes <- unique(dens)
  names(dens) <- names(x)
  out$dens <- dens
  out$denschar <- as.vector(dens, mode = "character")
  colgrad <- blue2red(length(dens_with_genes))
  #colgrad <- rainbow(length(dens_with_genes), start = 0, end = 1, alpha = 0.8)
  #colgrad <- heat.colors(length(dens_with_genes), alpha = 0.8)
  #colgrad <- terrain.colors(length(dens_with_genes), alpha = 0.8)
  #colgrad <- topo.colors(length(dens_with_genes), alpha = 0.8)
  #colgrad <- cm.colors(length(dens_with_genes), alpha = 0.8)
  sdens_with_genes <- sort(dens_with_genes, decreasing = FALSE)
  names(colgrad) <- sdens_with_genes
  out$cols <- colgrad
  genecols <- colgrad[as.vector(dens, mode = "character")]
  names(genecols) <- names(x)
  out$genecols <- genecols
  return(out)
}

f.replace.Inf.by.next <- function(x) {
  if (sum(abs(x) == Inf, na.rm = TRUE) == 0) {return(x)}
  cat("replacing Inf/-Inf with the next highest/lowest value\n")
  notNA <- !is.na(x)
  minInf <- x == -Inf
  pluInf <- x == Inf
  nx <- x[notNA&(!minInf)&(!pluInf)]
  x[notNA&pluInf] <- max(nx)
  x[notNA&minInf] <- min(nx)
  return(x)
}
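
A quick check of this helper on a toy vector (Inf becomes the largest finite value, -Inf the smallest, NA stays NA):

f.replace.Inf.by.next(c(1, Inf, -Inf, NA, 5)) # 1 5 1 NA 5 (plus the message)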

f.density.scatter <- function(x, y, xName = "", yName = "", main = "", tryToScale = FALSE, ...) {
  x <- f.replace.Inf.by.next(x)
  y <- f.replace.Inf.by.next(y)
  if (tryToScale) {
    if (max(x) > 10*median(x)) {
      isExtreme <- x > 10*median(x)
      cat("limiting x to 10*median(x) - replacing", sum(isExtreme), "extreme values (", mean(isExtreme)*100, "percent)\n")
      x[isExtreme] <- 10*median(x)
    }
    if (max(y) > 10*median(y)) {
      isExtreme <- y > 10*median(y)
      cat("limiting y to 10*median(y) - replacing", sum(isExtreme), "extreme values (", mean(isExtreme)*100, "percent)\n")
      y[isExtreme] <- 10*median(y)
    }
  }
  #   xLimits <- range(x)
  #   if (max(x) > 10*median(x)) {
  #     cat("limiting plot to a quantile(x, 0.99)\n")
  #     xLimits[2] <- quantile(x, 0.99)
  #   }
  #   yLimits <- range(y)
  #   if (max(y) > 10*median(y)) {
  #     cat("limiting plot to a quantile(y, 0.99)\n")
  #     yLimits[2] <- quantile(y, 0.99)
  #   }
  colorGrad <- f.genetodensitycolor(x, y)
  # f.scatter(x, y, xName, yName, cexFactor = 1, col = colorGrad$genecols, main = main, xlim = xLimits, ylim = yLimits)
  f.scatter(x, y, xName, yName, cexFactor = 1, col = colorGrad$genecols, main = main, ...)
  return(NULL)
}

f.allinone <- function(data, rdir, rt = 0, prefix = "", useSVG = FALSE, onWin = FALSE, scaleToSample = FALSE) 
{
  # this function draws scatterplots, histograms and sorted scatters
  # data <dataframe/matrix>: values, one sample per column
  # rdir <directory>: folder for the figure
  # rt <number>: a threshold
  # prefix <string>: a word that will be added to the filename
  # useSVG <bool>: tells whether to draw an SVG or a TIFF
  # onWin <bool>: use this if you are on Windows
  # scaleToSample <bool>: scale the axes per sample instead of globally
  hits.union <- rownames(data)[which(apply(f.comp(data,rt),1,sum) > 0)]
  data <- data[hits.union,]
  # get sorted data and make sure everything is a data frame
  s.data <- apply(data,2,sort)
  data <- as.data.frame(data)
  s.data <- as.data.frame(s.data)
  # some variables
  ns = ncol(data)
  sn = colnames(data)
  # use CairoSVG if you are on windows
  afname <- paste(prefix,'_allinone_',rt,'_',paste(sn,collapse='_'),sep='')
  if (nchar(afname) > 50) { afname <- paste(prefix,'_allinone_',rt,'_many',sep='') }
  if (useSVG) {
    #print("use: rsvg-convert -a -d 300 -p 300 svgfile > pngfile")
    afpath <- file.path(rdir,paste(afname, '.svg', sep = ''))
    if (onWin) { CairoSVG(file = afpath) } 
    else { svg(filename = afpath, onefile = TRUE, bg = FALSE, antialias = "default") }
  } else {
    afpath <- file.path(rdir,paste(afname, '.tiff', sep = ''))
    tiff(file=afpath, width = 600*ns, height = 600*ns, units = "px", pointsize = 20, compression = "lzw", bg = "white", type = "Xlib")
  }
  # plot everything
  layout(mat=matrix(c(1:(ns*ns)), nrow=ns, byrow = TRUE))
  cat(paste("plotting", ns, "rows\n"))
  cat(paste(c("|", rep(' ', ns),"|\n|"),sep='',collapse=''))
  for (i in c(1:ns))
  {
    for (k in c(1:ns))
    {
      if (scaleToSample) {
        xMin <- floor(min(data[,k]))
        xMax <- ceiling(max(data[,k]))
        yMin <- floor(min(data[,i]))
        yMax <- ceiling(max(data[,i]))
      } else {
        xMin <- floor(min(data))
        xMax <- ceiling(max(data))
        yMin <- floor(min(data))
        yMax <- ceiling(max(data))
      }
      if (i == k) 
      {
        # histogram in the diagonal
        f.histogram(data[f.comp(data[,k],rt),k], xName = sn[k], xlim = c(xMin, xMax), main = "")
      }
      else if (i < k)
      { 
        # scatterplot with densities on the top right
        if ((length(unique(data[,k][!is.na(data[,k])])) > 2) & (length(unique(data[,i][!is.na(data[,i])])) > 2)) {
          colorGrad <- f.genetodensitycolor(data[,k], data[,i])
        } else {
          colorGrad <- list(genecols = rep("black", nrow(data)))
        }
        f.scatter(data[,k], data[,i], sn[k], sn[i], cexFactor = 1.5, col = colorGrad$genecols, xlim = c(xMin, xMax), ylim = c(yMin, yMax), main = "")
        abline(0,1,col="black")
      }
      else if (i > k)
      {
        # sorted scatterplot on the bottom left
        f.scatter(s.data[,k], s.data[,i], sn[k], sn[i], cexFactor = 1.5, xlim = c(xMin, xMax), ylim = c(yMin, yMax), main = "")
        abline(0,1,col="darkorchid3")
      }
    }
    cat("#")
  }
  cat('|\n')
  dev.off()
  if (useSVG) { 
    system(paste("rsvg-convert -a -d 300 -p 300 ", afpath," > ", afpath, ".png", sep = '')) 
    #system(paste("rm ", afpath, sep = ''))
  }
}

f.compare.ret.and.fea <- function(trainData, testData, ret, fea) {
  trainData$abs <- abs(trainData[[ret]])
  trainData$sig <- sign(trainData[[ret]])*rnorm(nrow(trainData), .5, .1)
  testData$abs <- abs(testData[[ret]])
  testData$sig <- sign(testData[[ret]])*rnorm(nrow(testData), .5, .1)
  trainData <- trainData[(!is.na(trainData[[ret]])) & (!is.na(trainData[[fea]])),]
  testData <- testData[(!is.na(testData[[ret]])) & (!is.na(testData[[fea]])),]
  layout(matrix(1:6, nrow = 2, byrow = TRUE))
  f.density.scatter(as.numeric(trainData[[fea]]), trainData[[ret]], fea, ret, "return")
  f.density.scatter(as.numeric(trainData[[fea]]), trainData$abs, fea, ret, "abs(return)") 
  f.density.scatter(as.numeric(trainData[[fea]]), trainData$sig, fea, ret, "sign(return)") 
  f.density.scatter(as.numeric(testData[[fea]]), testData[[ret]], fea, ret, "return")
  f.density.scatter(as.numeric(testData[[fea]]), testData$abs, fea, ret, "abs(return)") 
  f.density.scatter(as.numeric(testData[[fea]]), testData$sig, fea, ret, "sign(return)")
  return(NULL)
}

f.compare.ret.and.fea.no.dens <- function(trainData, testData, ret, fea) {
  trainData$abs <- abs(trainData[[ret]])
  trainData$sig <- sign(trainData[[ret]])*rnorm(nrow(trainData), .5, .1)
  testData$abs <- abs(testData[[ret]])
  testData$sig <- sign(testData[[ret]])*rnorm(nrow(testData), .5, .1)
  trainData <- trainData[(!is.na(trainData[[ret]])) & (!is.na(trainData[[fea]])),]
  testData <- testData[(!is.na(testData[[ret]])) & (!is.na(testData[[fea]])),]
  layout(matrix(1:6, nrow = 2, byrow = TRUE))
  f.scatter(as.numeric(trainData[[fea]]), trainData[[ret]], fea, ret, cexFactor = 1, col = "black", main = "return")
  f.scatter(as.numeric(trainData[[fea]]), trainData$abs, fea, ret, cexFactor = 1, col = "black", main = "abs(return)") 
  f.scatter(as.numeric(trainData[[fea]]), trainData$sig, fea, ret, cexFactor = 1, col = "black", main = "sign(return)") 
  f.scatter(as.numeric(testData[[fea]]), testData[[ret]], fea, ret, cexFactor = 1, col = "black", main = "return")
  f.scatter(as.numeric(testData[[fea]]), testData$abs, fea, ret, cexFactor = 1, col = "black", main = "abs(return)") 
  f.scatter(as.numeric(testData[[fea]]), testData$sig, fea, ret, cexFactor = 1, col = "black", main = "sign(return)")
  return(NULL)
}

some functions for imputations and so on

f.get.black.list <- function(train, test, featuresToCheck) {
  cat("checking features for difference between train and test\n")
  checks <- matrix(0, nrow = length(featuresToCheck), ncol = 3, dimnames = list(featuresToCheck, c("est", "pVal", "fdr")))
  for (thing in featuresToCheck) {
    temp <- t.test(as.numeric(train[[thing]]), as.numeric(test[[thing]]))
    checks[thing, "est"] <- temp$statistic
    checks[thing, "pVal"] <- temp$p.value
  }
  checks[, "fdr"] <- p.adjust(checks[, "pVal"], method = "BH") # adjust across all features at once, not per test
  out <- rownames(checks)[checks[, "fdr"] < 0.01]
  print(checks[out,])
  return(out)
}

f.simple.imputation.with.nearest.neighbors <- function(data, nnq = 0.01, impFun = median) {
  require("cluster")
  dsm <- NULL
  nnm <- NULL
  dsmCount <- 0
  nnmCount <- 0
  while (sum(c(is.null(dsm), is.null(nnm))) > 0) { 
    if (is.null(dsm)) {
      dsmCount <- dsmCount + 1
      cat("calculating dissimilarity...", dsmCount, "\n")
      dsm <- as.matrix(daisy(data, "gower"))
    }
    if (is.null(nnm)) {
      nnmCount <- nnmCount + 1
      cat("searching nearest neighbors...", nnmCount, "\n")
      if (nnmCount > 1) { nnq <- nnq + 0.1*nnq }
      nnm <- t(apply(dsm, 1, function(x) which(x < quantile(x, nnq)))) # it includes itself, but actually no problem
    }
  }
  cat("replacing NAs\n")
  for (i in 1:nrow(data)) {
    if ((i %% 1000) == 0) { cat('#') }
    if ((i %% 10000) == 0) { cat('\n') }
    mask <- is.na(data[i,])
    if (sum(mask) == 0) { next }
    if (sum(mask) == 1) {
      data[i, mask] <- impFun(data[nnm[i,],mask], na.rm = TRUE)
    } else {
      data[i, mask] <- apply(data[nnm[i,],mask], 2, function(x) impFun(x, na.rm = TRUE))
    }
  }
  cat('\n')
  return(data)
}

f.simple.imputation.with.nearest.neighbors.sampled <- function(data, nnq = 0.01, impFun = mean, iter = 10, splitInto = 4, numCores = splitInto) {
  require("parallel")
  temp <- array(0, c(nrow(data), ncol(data), iter), list(rownames(data), colnames(data), paste0("iter_", 1:iter)))
  for (i in 1:iter) {
    splitBy <- sample.int(splitInto, nrow(data), replace = TRUE)
    subSets <- split(data, splitBy)
    subSetsImp <- mclapply(subSets, function(x) f.simple.imputation.with.nearest.neighbors(x, nnq, impFun), mc.cores = numCores)
    #subSetsImp <- do.call("rbind", subSetsImp)
    #rownames(subSetsImp) <- sapply(rownames(subSetsImp), function(x) unlist(strsplit(x, '.', TRUE))[2])
    for (ssi in subSetsImp) {
      tt <- as.matrix(ssi)
      temp[rownames(tt), colnames(tt), paste0("iter_", i)] <- tt
    }
  }
  out <- apply(temp, c(1,2), function(x) median(as.numeric(x), na.rm = TRUE))
  return(out)
}

f.simple.imputation.with.nearest.neighbors.sampled.VIM <- function(data, knn = 6, iter = 10, splitInto = 8, numCores = 4) {
  require("parallel")
  require("VIM")
  out <- array(0, c(nrow(data), ncol(data), iter), list(rownames(data), colnames(data), paste0("iter_", 1:iter)))
  for (i in 1:iter) {
    temp <- matrix(0, nrow(data), ncol(data), dimnames = list(rownames(data), colnames(data)))
    splitBy <- sample.int(splitInto, nrow(data), replace = TRUE)
    subSets <- split(data, splitBy)
    subSetsImp <- mclapply(subSets, function(x) kNN(x, k = knn, numFun = mean), mc.cores = numCores)
    for (ssi in subSetsImp) {
      for (cn in colnames(temp)) {
        temp[rownames(ssi), cn] <- ssi[, cn]
      }
    }
    out[,,i] <- temp
  }
  #out <- apply(out, c(1,2), function(x) median(as.numeric(x), na.rm = TRUE))
  return(out)
}

f.checkDupliBool <- function(x) {
  x <- x[!is.na(x)]
  temp <- table(x)
  out <- sum(temp > 1) != 0 # TRUE if at least one value occurs more than once
  return(out)
}

f.checkDupli <- function(x) {
  x <- x[!is.na(x)]
  temp <- table(x)
  dupliCount <- sum(temp>1)
  return(dupliCount)
}

f.checkDupliVal <- function(x) {
  x <- x[!is.na(x)]
  temp <- table(x)
  dupliCount <- sum(temp>1)
  if (dupliCount == 0) { out <- NA}
  else if (dupliCount == 1) { out <- as.numeric(names(temp)[temp > 1])}
  else { out <- paste(names(temp)[temp > 1], collapse = '|') }
  return(out)
}
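
All three are meant to be applied per column, e.g. to spot features that are secretly discrete; a quick toy check:

f.checkDupliBool(c(1, 2, 2, 3, NA))  # TRUE - at least one duplicated value
f.checkDupli(c(1, 2, 2, 3, 3, 3))    # 2 - two distinct values occur more than once
f.checkDupliVal(c(1, 2, 2, 3, 3, 3)) # "2|3" - the duplicated values themselves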

filter, normalization, and model functions

f.filter.handpicked <- function(data) {
  trainMask_D0 <- abs(data$Ret_D0) < 4e-4; trainMask_D0[is.na(trainMask_D0)] <- TRUE; print(sum(!trainMask_D0))
  trainMask_MO <- abs(data$Ret_MinusOne) < .2; trainMask_MO[is.na(trainMask_MO)] <- TRUE; print(sum(!trainMask_MO))
  trainMask_MT <- abs(data$Ret_MinusTwo) < .2; trainMask_MT[is.na(trainMask_MT)] <- TRUE; print(sum(!trainMask_MT))
  trainMask_F3 <- data$Feature_3 > -2; trainMask_F3[is.na(trainMask_F3)] <- TRUE; print(sum(!trainMask_F3))
  trainMask_F4 <- data$Feature_4 > -4; trainMask_F4[is.na(trainMask_F4)] <- TRUE; print(sum(!trainMask_F4))
  trainMask_F17 <- data$Feature_17 < 2; trainMask_F17[is.na(trainMask_F17)] <- TRUE; print(sum(!trainMask_F17))
  trainMask_F18 <- data$Feature_18 > -1.3; trainMask_F18[is.na(trainMask_F18)] <- TRUE; print(sum(!trainMask_F18))
  trainMask_F22 <- data$Feature_22 > -4.5; trainMask_F22[is.na(trainMask_F22)] <- TRUE; print(sum(!trainMask_F22))
  trainMask_F23 <- data$Feature_23 > -2; trainMask_F23[is.na(trainMask_F23)] <- TRUE; print(sum(!trainMask_F23))
  trainMask_F25 <- data$Feature_25 < 1; trainMask_F25[is.na(trainMask_F25)] <- TRUE; print(sum(!trainMask_F25))
  finalMask <- trainMask_D0 & trainMask_MO & trainMask_MT & trainMask_F3 & trainMask_F4 & trainMask_F22 & trainMask_F17 & trainMask_F18 & trainMask_F23 & trainMask_F25
  out <- data[finalMask,]
  return(out)
}

f.normalize.quantiles <- function(train, test, colsToNorm, quant = 1e-3) {
  for (colToNorm in colsToNorm) {
    trainQuants <- sort(quantile(train[[colToNorm]], seq(0, 1, quant), na.rm = TRUE)) # in some rare cases, two quantiles are almost equal but it complains about the sorting
    testQuants <- sort(quantile(test[[colToNorm]], seq(0, 1, quant), na.rm = TRUE))
    trainMap <- findInterval(train[[colToNorm]], trainQuants)
    testMap <- findInterval(test[[colToNorm]], testQuants)
    train[[colToNorm]] <- testQuants[trainMap]
    test[[colToNorm]] <- testQuants[testMap]
  }
  out <- list(train = train, test = test)
  return(out)
}
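
Note the direction of the mapping: both data sets end up on the test quantiles, so the training distribution is forced to look like the test distribution and not the other way around. A minimal sketch with simulated columns (hypothetical data, just to show the call):

trainToy <- data.frame(Feature_X = rnorm(5000, mean = 1)) # shifted relative to the test data
testToy <- data.frame(Feature_X = rnorm(5000, mean = 0))
normed <- f.normalize.quantiles(trainToy, testToy, "Feature_X", quant = 0.01)
round(c(mean(normed$train$Feature_X), mean(normed$test$Feature_X)), 2) # both close to 0 now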

f.stahel.trafo <- function(x) {
  med <- median(x, na.rm = TRUE)
  qua <- quantile(x, 0.25, na.rm = TRUE)
  con <- med/((med/qua)^2.9)
  out <- log(x + con)
  return(out)
}
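
This is a "started log": instead of log(x) it uses log(x + c) with the constant c derived from the median and the lower quartile (Stahel's rule), so values at or near zero do not blow up. A toy check on skewed positive data (made-up numbers, not the actual Feature_7/9/15 distributions):

set.seed(1)
skewed <- rexp(1000, rate = 2)^2 # right-skewed, many values close to zero
f.histogram(skewed, xName = "raw") # long right tail
f.histogram(f.stahel.trafo(skewed), xName = "started log") # far more symmetric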

f.get.additional.variables <- function(data, blackList = c(), noInt = FALSE) {
  require("caTools")
  retCols <- grep("^Ret", colnames(data), value = TRUE)
  feaCols <- grep("^Feature", colnames(data), value = TRUE)
  weiCols <- grep("^Weight", colnames(data), value = TRUE)
  pastRet <- paste0("Ret_", c("MinusTwo", "MinusOne", 2:120))
  futuRet <- paste0("Ret_", c(121:180, "PlusOne", "PlusTwo"))
  originalCols <- colnames(data)
  returnsToSimplify <- paste0("Ret_", 2:120)
  if (length(blackList)) {
    returnsToSimplify <- setdiff(returnsToSimplify, blackList)
    feaCols <- setdiff(feaCols, blackList)
  }
  data$Ret_mean <- apply(data[,returnsToSimplify], 1, function(x) mean(x, na.rm = TRUE))
  data$Ret_median <- apply(data[,returnsToSimplify], 1, function(x) median(x, na.rm = TRUE))
  data$Ret_range <- apply(data[,returnsToSimplify], 1, function(x) max(x, na.rm = TRUE)-min(x, na.rm = TRUE))
  data$Ret_sumAbs <- apply(data[,returnsToSimplify], 1, function(x) sum(abs(x), na.rm = TRUE))
  data$Ret_var <- apply(data[,returnsToSimplify], 1, function(x) var(x, na.rm = TRUE))
  data$Ret_pos <- apply(data[,returnsToSimplify], 1, function(x) sum(x > 0, na.rm = TRUE))
  data$Ret_sum <- apply(data[,returnsToSimplify], 1, function(x) sum(x, na.rm = TRUE))
  data$Ret_sum_scaled <- data$Ret_sum/2*3 # scale the ~120 observed minutes up to a full 180-minute day (factor 3/2)
  data$Ret_past_mean <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) mean(x, na.rm = TRUE))
  data$Ret_past_sumAbs <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) sum(abs(x), na.rm = TRUE))
  data$Ret_past_range <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) max(x, na.rm = TRUE)-min(x, na.rm = TRUE))
  data$Ret_past_var <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) var(x, na.rm = TRUE))
  data$Ret_past_trend <- apply(data[,intersect(paste0("Ret_", 2:120), returnsToSimplify)], 1, function(x) sum(x, na.rm = TRUE))
  data$Ret_past_mean_trend <- apply(data[,intersect(paste0("Ret_", 2:120), returnsToSimplify)], 1, function(x) mean(x, na.rm = TRUE))
  data$Ret_mean_trend_10 <- apply(data[,intersect(paste0("Ret_", 2:10), returnsToSimplify)], 1, function(x) mean(x, na.rm = TRUE))
  data$Ret_mean_trend_15 <- apply(data[,intersect(paste0("Ret_", 6:15), returnsToSimplify)], 1, function(x) mean(x, na.rm = TRUE))
  for (i in seq(20, 120, by = 5)) { 
    toSummarize <- intersect(paste0("Ret_", (i-10+1):i), returnsToSimplify)
    if (length(toSummarize) > 3) {
      data[[paste0("Ret_mean_trend_", i)]] <- apply(data[,], 1, function(x) mean(x, na.rm = TRUE))
    }
  }
#   if ("Ret_121" %in% colnames(data)) {
#     data$Ret_mean_trend <- apply(data[,paste0("Ret_", 121:180)], 1, function(x) mean(x, na.rm = TRUE))
#     data$Ret_mean_trend_130 <- apply(data[,paste0("Ret_", 121:130)], 1, function(x) mean(x, na.rm = TRUE))
#     data$Ret_mean_trend_135 <- apply(data[,paste0("Ret_", 126:135)], 1, function(x) mean(x, na.rm = TRUE))
#     for (i in seq(140, 180, by = 5)) { 
#       data[[paste0("Ret_past_mean_trend_", i)]] <- apply(data[,paste0("Ret_", (i-10+1):i)], 1, function(x) mean(x, na.rm = TRUE))
#     }
#   }
  # 
  simplifiedReturns <- setdiff(colnames(data), originalCols)
  # all feature interactions (pairwise multiplied)
  if (!noInt) {
    doubleInt <- combs(feaCols, 2)
    for (i in 1:nrow(doubleInt)) {
      curF <- doubleInt[i,]
      data[[paste0("int_", paste(curF, collapse = "X"))]] <- data[[curF[1]]]*data[[curF[2]]]
    }
  }
  # or add the triple feature interactions
#   tripleInt <- combs(feaCols, 3)
#   for (i in 1:nrow(tripleInt)) {
#     curF <- tripleInt[i,]
#     data[[paste(curF, collapse = "X")]] <- data[[curF[1]]]*data[[curF[2]]]*data[[curF[3]]]
#   }
  # and add the interactions of the features with all the past returns paste0(fea, "X", pastRet) >> ctrl-a-0 - also not too splendid (there are too many single past return points)
#   for (fea in feaCols) {
#     for (pas in pastRet) {
#       data[[paste0(fea, "X", pas)]] <- data[[fea]]*data[[pas]]
#     }
#   }
  # the same for the simplified returns >> ctrl-a-1 - not so great - how about only this
#   for (fea in feaCols) {
#     for (sim in simplifiedReturns) {
#       data[[paste0(fea, "X", sim)]] <- data[[fea]]*data[[sim]]
#     }
#   }
  
  out <- list(
    data = data, 
    simplifiedReturns = simplifiedReturns
  )
  if (!noInt) {
    out$dointCols = apply(doubleInt, 1, function(x) paste0("int_", paste(x, collapse = 'X')))
    # out$trintCols = apply(tripleInt, 1, function(x) paste(x, collapse = 'X')),
    # out$pasIntCols = paste0(rep(feaCols, each = length(pastRet)), "X", pastRet),
    # out$simIntCols = paste0(rep(feaCols, each = length(simplifiedReturns)), "X", simplifiedReturns)
  }
  return(out)
}

f.prepare.for.models <- function(train, test, type, doPreprocess = TRUE, toFactor = c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20"), addAnyway = c(), blackList = c(), noInt = FALSE, useQuantileNormalization = FALSE) {
  require("caret")
  #########################################################################################
  ### some things for data access
  retCols <- grep("^Ret", colnames(train), value = TRUE)
  feaCols <- grep("^Feature", colnames(train), value = TRUE)
  weiCols <- grep("^Weight", colnames(train), value = TRUE)
  pastRet <- paste0("Ret_", c("MinusTwo", "MinusOne", 2:120))
  futuRet <- paste0("Ret_", c(121:180, "PlusOne", "PlusTwo"))
  #########################################################################################
  ### calculate some additional variables
  toFactor <- setdiff(toFactor, blackList)
  if (type != "intra") {
    cat("calculating additional variables...\n")
    trainDataAndVars <- f.get.additional.variables(train, blackList, noInt)
    testDataAndVars <- f.get.additional.variables(test, blackList, noInt)
    train <- trainDataAndVars$data
    test <- testDataAndVars$data
  } else {
    if (!noInt) {
      trainDataAndVars <- f.get.additional.variables(train, blackList, noInt)
      testDataAndVars <- f.get.additional.variables(test, blackList, noInt)
      train <- trainDataAndVars$data
      test <- testDataAndVars$data
    }
  }
  #########################################################################################
  ### check for correlation with interactions and the regular features as well
  # intCols <- c(pastRet, trainDataAndVars$pasIntCols, trainDataAndVars$dointCols, trainDataAndVars$trintCols, trainDataAndVars$simplifiedReturns, trainDataAndVars$simIntCols)
  cat("check for correlation with target...\n")
  if (type == "inter") {targetVars <- c("Ret_PlusOne", "Ret_PlusTwo")}
  else if (type == "intra") {targetVars <- paste0("Ret_", 121:180)}
  else {targetVars <- type}
  if (type != "intra") { 
    intCols <- unique(c("Ret_MinusTwo", "Ret_MinusOne", trainDataAndVars$simplifiedReturns))
  } else {
    intCols <- c("Ret_MinusTwo", "Ret_MinusOne")
  }
  if (noInt) {
    cutOff <- .25
  } else {
    intCols <- unique(c(intCols, trainDataAndVars$dointCols))
    cutOff <- .75
  }
  intCors <- cor(train[,targetVars], train[, c(intCols, feaCols)], method = "spearman", use = "complete.obs")
  if (length(targetVars) == 1) {
    corSums <- intCors[1,]
  } else {
    corSums <- apply(abs(intCors[targetVars,]), 2, sum)
  }
  corSums <- corSums[setdiff(names(corSums), grep("Ret_[[:digit:]]{1,3}|_trend_", names(corSums), value = TRUE))]
  intVars <- names(corSums)[corSums > quantile(corSums, cutOff)]
  #########################################################################################
  ### convert the factors
  cat("convert factors...\n")
  for (ff in toFactor) {
    train[[ff]] <- as.factor(train[[ff]])
    test[[ff]] <- as.factor(test[[ff]])
  }
  #########################################################################################
  ### check for near zero variance and remove highly correlated features
  cat("check near-zero variance and remove highly correlated predictors...\n")
  nzv <- nearZeroVar(train[,intVars], saveMetrics= TRUE)
  temp <- train[,setdiff(intVars, toFactor)]
  descrCor <- cor(temp, use = "complete.obs")
  highlyCor <- colnames(temp)[findCorrelation(descrCor, cutoff = .75)]
  intVars <- setdiff(intVars, Reduce(union, list(rownames(nzv)[nzv$nzv], highlyCor)))
  intVars <- unique(c(intVars, addAnyway))
  #########################################################################################
  ### preprocess data (needs to be repeated for some vars in the intra day models)
  if (sum(apply(train, 2, function(x) sum(is.na(x)))) == 0) {
    if (doPreprocess) {
      cat("preprocessing data...\n")
#       comb <- rbind(train[, setdiff(intVars, toFactor)], test[, setdiff(intVars, toFactor)])
#       preProComb <- preProcess(comb, method = c("center", "scale", "YeoJohnson"))
#       tempComb <- predict(preProComb, comb)
      preproTrain <- preProcess(train[, setdiff(intVars, toFactor)], method = c("center", "scale", "YeoJohnson"))
      preproTest <- preProcess(test[, setdiff(intVars, toFactor)], method = c("center", "scale", "YeoJohnson"))
      tempTrain <- predict(preproTrain, train[, setdiff(intVars, toFactor)])
      tempTest <- predict(preproTest, test[, setdiff(intVars, toFactor)])
      for (cn in setdiff(intVars, toFactor)) { 
#         train[[cn]] <- tempComb[rownames(train), cn]
#         test[[cn]] <- tempComb[rownames(test), cn]
        train[[cn]] <- tempTrain[[cn]]
        test[[cn]] <- tempTest[[cn]]
      }
    }
  } else {
    cat("skip preprocessing data due to NAs...\n")
  }
  #########################################################################################
  ### preprocess with quantile normalization
  if (useQuantileNormalization) {
    if (doPreprocess) {stop("either use doPreprocess or useQuantileNormalization")}
    contVars <- setdiff(colnames(test), toFactor)
    preproData <- f.normalize.quantiles(train, test, contVars, quant = 0.0001)
    train <- preproData$train
    test <- preproData$test
  }
  #########################################################################################
  ### return
  out <- list(train = train, test = test, vars = intVars)
  return(out)
}

f.inter.day.predictions <- function(train, test, toPredict, predVars, modMet = "rpart", evaMet = "adaptive_cv") {
  #########################################################################################
#   cat("subsetting data...\n")
#   inTrain <- createDataPartition(y = train[[toPredict]], p = .5, list = FALSE)
#   trainSet <- train[inTrain,]
#   testSet <- train[-inTrain,]
  #########################################################################################
  cat("running model with", length(predVars), "variables...\n")
  predVars <- setdiff(predVars, c(toPredict))
  numReps <- ifelse(modMet == "rpart", 50, 10)
  myCtrl <- trainControl(method = evaMet, number = 10, repeats = numReps)
  if (modMet == "ranger") {
    formulaString <- paste0(toPredict, '~', paste(predVars, collapse = '+'))
    myFit <- train(formula(formulaString), train[,c(toPredict, predVars)], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
  } else {
    myFit <- train(x = train[,predVars], y = train[[toPredict]], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
  }
  predValsForScore <- predict(myFit, train)
  baseLine <- f.WMAE(train$Weight_Daily, train[[toPredict]], rep(0, nrow(train)))
  meanLine <- f.WMAE(train$Weight_Daily, train[[toPredict]], mean(train[[toPredict]], na.rm = TRUE))
  diffToNoPred <- f.WMAE(train$Weight_Daily, train[[toPredict]], predValsForScore) - baseLine
  diffToNoPredMean <- f.WMAE(train$Weight_Daily, train[[toPredict]], predValsForScore) - meanLine
  cat("Difference to zero prediction:", diffToNoPred, "... in percent:", diffToNoPred/baseLine*100, "\n")
  cat("Difference to mean prediction:", diffToNoPredMean, "... in percent:", diffToNoPredMean/meanLine*100, "\n")
  predValsForSubmission <- predict(myFit, test)
  #########################################################################################
  out <- list(mod = myFit, pred = predValsForSubmission, score = diffToNoPred/baseLine*100)
  return(out)
}

f.get.additional.variables.intra.test <- function(data, timePointToPredict, memory = 30, win = 10, blackList = c()) {
  validVars <- c(paste0("Ret_", 2:120), paste0("Ret_pred_", 121:180))
  if (length(blackList)) { validVars <- setdiff(validVars, blackList) }
  previous <- (timePointToPredict-memory):(timePointToPredict-1)
  previous <- intersect(c(paste0("Ret_", previous), paste0("Ret_pred_", previous)), validVars)
  # winNums <- 1:(length(previous)-win)
  winNums <- seq(1, length(previous)-win, by = ceiling(win/2))
  cat("calculating indicators...\n")
  for (i in winNums) {
    winToUse <- previous[i:(i+win)]
    data[[paste0("win_mean", i)]] <- apply(data[,winToUse], 1, function(x) mean(x, na.rm = TRUE))
    data[[paste0("win_sumAbs", i)]] <- apply(data[,winToUse], 1, function(x) sum(abs(x), na.rm = TRUE))
    data[[paste0("win_var", i)]] <- apply(data[,winToUse], 1, function(x) sum(abs(x), na.rm = TRUE))
  }
  return(data)
}

f.on.the.fly.intra.day.predictions <- function(train, test, timePointToPredict, predVars, memory = 30, win = 10, modMet = "rpart", evaMet = "adaptive_cv", blackList = c(), useQuantileNormalization = FALSE) {
  #########################################################################################
  ### calculate some more indicators (sliding windows)
  origPredVars <- predVars
  cat("calculating sliding windows...\n")
  train <- f.get.additional.variables.intra.test(train, timePointToPredict, memory, win, blackList)
  test <- f.get.additional.variables.intra.test(test, timePointToPredict, memory, win, blackList)
  winVars <- grep("^win_", colnames(train), value = TRUE)
  sdev <- apply(train[,winVars], 2, function(x) sd(x, na.rm = TRUE))
  winVars <- names(sdev)[sdev > 0]
  #########################################################################################
  ### some preprocessing of the new variables
  cat("preprocessing data...\n")
  if (useQuantileNormalization) {
    preproData <- f.normalize.quantiles(train, test, winVars, quant = 0.0001)
    train <- preproData$train
    test <- preproData$test
  } else {
    preproTrain <- preProcess(train[, winVars], method = c("center", "scale", "YeoJohnson"))
    preproTest <- preProcess(test[, winVars], method = c("center", "scale", "YeoJohnson"))
    train[,winVars] <- predict(preproTrain, train[, winVars])[,winVars]
    test[,winVars] <- predict(preproTest, test[, winVars])[,winVars]
  }
  #########################################################################################
  ### check for correlation with windows and prefilter
  origWinVars <- winVars
  cat(length(winVars), "windows with sd > 0\n")
  cors <- cor(train[,paste0("Ret_", timePointToPredict)], train[, winVars], method = "spearman")[1,]
  winVars <- names(cors)[which(cors > quantile(cors, .75, na.rm = TRUE))] # which() drops possible NAs
  #########################################################################################
  # cat("subsetting data...\n")
  toPredict <- paste0("Ret_", timePointToPredict)
  #inTrain <- createDataPartition(y = train[[toPredict]], p = .5, list = FALSE)
  #trainSet <- train[inTrain,]
  #testSet <- train[-inTrain,]
  #########################################################################################
  predVars <- unique(c(predVars, winVars))
  useAnyway <- paste0(ifelse(timePointToPredict == 121, "Ret_", "Ret_pred_"), timePointToPredict-1) # there is some autocorrelation here
  predVars <- unique(c(predVars, useAnyway))
  missing <- predVars[!(predVars %in% colnames(train))]
  if (length(missing)) {cat(missing, "is not in the data frame\n")}
  cat("running model with", length(predVars), "variables...\n")
  newVar <- paste0("Ret_pred_", timePointToPredict)
  numReps <- ifelse(modMet == "rpart", 50, 10)
  myCtrl <- trainControl(method = evaMet, number = 10, repeats = numReps)
  colsWithNA <- apply(train[,c(toPredict, predVars)], 2, function(x) sum(is.na(x)))
  cat(names(colsWithNA)[colsWithNA > 0], "\n")
  if (modMet == "ranger") {
    formulaString <- paste0(toPredict, '~', paste(predVars, collapse = '+'))
    cat(formulaString, "\n")
    myFit <- train(formula(formulaString), train[,c(toPredict, predVars)], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
  } else {
    myFit <- train(x = train[,predVars], y = train[[toPredict]], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
  }
  predValsForScore <- predict(myFit, train)
  baseLine <- f.WMAE(train$Weight_Intraday, train[[toPredict]], rep(0, nrow(train)))
  diffToNoPred <- f.WMAE(train$Weight_Intraday, train[[toPredict]], predValsForScore) - baseLine
  cat("Difference to zero prediction:", diffToNoPred, "... in percent:", diffToNoPred/baseLine*100, "\n")
  train[[newVar]] <- predict(myFit, train)
  test[[newVar]] <- predict(myFit, test)
  #########################################################################################
  ### NOTE: for security, remove the windows again
  train <- train[, setdiff(colnames(train), origWinVars)]
  test <- test[, setdiff(colnames(test), origWinVars)]
  out <- list(train = train, test = test, vars = origPredVars, mod = myFit, score = diffToNoPred/baseLine*100)
  return(out)
}

f.WMAE <- function(we, y, yHat) { sum(as.numeric(we)*abs(as.numeric(y)-as.numeric(yHat)))/length(we) }
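
f.WMAE is the competition's weighted mean absolute error, used above to compare the models against the zero and mean baselines. A worked toy example (numbers invented):

we <- c(1, 2, 1) # weights
y <- c(0.01, -0.02, 0.005) # true returns
f.WMAE(we, y, rep(0, 3)) # zero baseline: (1*0.01 + 2*0.02 + 1*0.005)/3 = 0.0183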

the actual processing

#########################################################################################
### imputation - libraries
library("VIM")
library("mice")
library("randomForest")
library("cluster")
library("ranger")
library("dbscan")
library("rpart")
library("penalizedSVM")
library("mlr")
library("proxy")
library("party")
library("neuralnet")
library("brnn")
library("qrnn")
library("anfis")
library("frbs")
library("rPython")
library("caret")
library("rpart")
library("doMC")
registerDoMC(cores = 32)

#########################################################################################
### data reading and transformation
rDir <- "/home/ubuntu/Winton"
trainDataWithNA <- read.csv(file.path(rDir, "train.csv"), stringsAsFactors = FALSE, row.names = 1)
testDataWithNA <- read.csv(file.path(rDir, "test_2.csv"), stringsAsFactors = FALSE, row.names = 1)
rownames(trainDataWithNA) <- paste0("tr_", 1:nrow(trainDataWithNA))
rownames(testDataWithNA) <- paste0("te_", 1:nrow(testDataWithNA))
for (fea in paste0("Feature_", c(7,9,15))) {
  trainDataWithNA[[fea]] <- f.stahel.trafo(trainDataWithNA[[fea]])
  testDataWithNA[[fea]] <- f.stahel.trafo(testDataWithNA[[fea]])
}
commonCols <- intersect(colnames(trainDataWithNA), colnames(testDataWithNA))
comb <- rbind(trainDataWithNA[,commonCols], testDataWithNA[,commonCols])

#########################################################################################
### I noticed that the training data does not seem to fit the test data:
# "good models" performed very poorly on the test data, but well on the training data (cross-validation)
# several returns are significantly different in the training data compared to the test data (t-test)
# many approaches based on the past returns and the features (the data present in both train and test)
# failed miserably. In the end it looks like the future returns (which also contain no NAs) separate the
# training data nicely into two large groups - use only the future returns, for which the model fit was always good.
### try dbscan - one could also use the randomForest proximity here?
futureReturns <- paste0("Ret_", c("PlusOne", "PlusTwo", 121:180))
kNNdistplot(trainDataWithNA[,futureReturns])
dbsCl <- dbscan(trainDataWithNA[,futureReturns], .02, length(futureReturns)+1)
table(dbsCl$cluster)
rowsToIgnore <- rownames(trainDataWithNA)[dbsCl$cluster != 1]
write.table(rowsToIgnore, file.path(rDir, "notInCenter_dbscan.txt"), row.names = FALSE, col.names = FALSE, quote = FALSE)

formulaString <- paste0('~', paste(futureReturns, collapse = '+'))
myPCA <- princomp(formula(formulaString), trainDataWithNA[, futureReturns], na.action = na.omit)
plot(myPCA)
trafoData <- as.matrix(trainDataWithNA[, futureReturns])%*%as.matrix(myPCA$loadings[,c("Comp.1", "Comp.2")])
numToCol <- c("black", "firebrick2", "dodgerblue2", "darkorchid2", "darkgoldenrod2")
plot(trafoData[,"Comp.1"], trafoData[,"Comp.2"], pch = 16, col = numToCol[dbsCl$cluster+1])
f.density.scatter(trafoData[,"Comp.1"], trafoData[,"Comp.2"])

#########################################################################################
### have a closer look at the dbscan result - they are really different
# after removing the "notInCenter" rows, Ret_PlusOne and Ret_PlusTwo have very good fits based on Feature_7 alone
# but of course - online, the submissions are really trashy
# what is really awkward is that the correlation between Ret_PlusTwo and Feature_7 is really good in the "notInCenter" rows
toRemove <- scan(file.path(rDir, "notInCenter_dbscan.txt"), what = "character")
validRows <- setdiff(rownames(trainDataWithNA), toRemove)
f.density.scatter(trainDataWithNA[validRows, "Feature_7"], trainDataWithNA[validRows, "Ret_PlusTwo"])
f.density.scatter(trainDataWithNA[toRemove, "Feature_7"], trainDataWithNA[toRemove, "Ret_PlusTwo"])
f.density.scatter(trainDataWithNA[validRows, "Feature_7"], abs(trainDataWithNA[validRows, "Ret_PlusOne"]))
f.density.scatter(trainDataWithNA[validRows, "Feature_7"], abs(trainDataWithNA[validRows, "Ret_PlusTwo"]))

# also interesting, the correlations between the past returns are really different between the two sets
retCols <- grep("^Ret", colnames(trainDataWithNA), value = TRUE) # define the column groups used below
feaCols <- grep("^Feature", colnames(trainDataWithNA), value = TRUE)
pastRet <- paste0("Ret_", c("MinusTwo", "MinusOne", 2:120))
f.generic.correlation.matrix(trainDataWithNA[validRows,feaCols], rDir, "isCenter", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[toRemove,feaCols], rDir, "notCenter", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[validRows,pastRet], rDir, "isCenter_pastRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[toRemove,pastRet], rDir, "notCenter_pastRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[validRows,retCols], rDir, "isCenter_pastAndFutRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[toRemove,retCols], rDir, "notCenter_pastAndFutRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(testDataWithNA[,pastRet], rDir, "test_pastRet", "spearman", corUse = "pairwise.complete.obs")

# this did not change too much though -.- try to solve it with normalization
blackListBefore <- f.get.black.list(trainDataWithNA, testDataWithNA, colnames(testDataWithNA))
blackListAfter <- f.get.black.list(trainDataWithNA[setdiff(rownames(trainDataWithNA), toRemove),], testDataWithNA, colnames(testDataWithNA))
print(length(intersect(blackListBefore, blackListAfter))/length(union(blackListBefore, blackListAfter)))

#########################################################################################
### compare features and some past returns between train and test data - possibly use quantile normalization?
tempTrain <- trainDataWithNA[validRows,]
tempTest <- testDataWithNA
tempTrain$Ret_D0 <- apply(tempTrain[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
tempTest$Ret_D0 <- apply(tempTest[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
for (toCheck in c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Ret_", 2:120), paste0("Feature_", 1:25))) {
  tiff(file.path(rDir, "train_vs_test", paste0("train_vs_test_", toCheck, ".tiff")), type = "Xlib", compression = "lzw", width = 500, height = 500, pointsize = 20)
  testQ <- quantile(tempTest[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
  trainQ <- quantile(tempTrain[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
  f.scatter(trainQ, testQ, "train data", "test data", main = toCheck)
  abline(a = 0, b = 1, col = "red")
  if (toCheck %in% paste0("Ret_", 2:120)) {
    abline(h = 0.008, v=  0.008, col = "green")
    abline(h = -0.008, v = -0.008, col = "green")
  }
  dev.off()
}
# categorical: 1, 5, 9, 10, 13, 20
# normal: 2, 3, 6, 7, 15, 21, 24
# extreme case but fitting: 4
# single outlier: 11, 19, 22
# bent end: 14
# knee/bump: 17, 18, 23, 25
# not ok at all: 8, 16 (factors with few levels)
# D0: abs() < 4e-4
# Ret_MinusOne/Two: abs() < 0.02
tempTrain <- f.filter.handpicked(tempTrain); nrow(tempTrain)
tempTest <- f.filter.handpicked(tempTest); nrow(tempTest)
tempTrain <- tempTrain[apply(tempTrain[,paste0("Ret_", 2:120)], 1, function(x) sum(abs(x) > .006, na.rm = TRUE)) == 0, ]; nrow(tempTrain)
tempTest <- tempTest[apply(tempTest[,paste0("Ret_", 2:120)], 1, function(x) sum(abs(x) > .006, na.rm = TRUE)) == 0, ]; nrow(tempTest)
for (toCheck in c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Ret_", 2:120), paste0("Feature_", 1:25))) {
  tiff(file.path(rDir, "train_vs_test", paste0("filtered_train_vs_test_", toCheck, ".tiff")), type = "Xlib", compression = "lzw", width = 500, height = 500, pointsize = 20)
  testQ <- quantile(tempTest[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
  trainQ <- quantile(tempTrain[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
  f.scatter(trainQ, testQ, "train data", "test data", main = toCheck)
  abline(a = 0, b = 1, col = "red")
  dev.off()
}

validRowsTrain <- rownames(tempTrain)
validRowsTest <- rownames(tempTest)
write.table(validRowsTrain, file.path(rDir, "validRowsTrain.txt"), quote = FALSE, col.names = FALSE, row.names = FALSE) # 33103 / 32341
write.table(validRowsTest, file.path(rDir, "validRowsTest.txt"), quote = FALSE, col.names = FALSE, row.names = FALSE) # 101881 / 98764

# preproTrain <- preProcess(tempTrain[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))], method = c("center", "scale", "YeoJohnson"))
# preproTest <- preProcess(tempTest[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))], method = c("center", "scale", "YeoJohnson"))
# tempTrain <- predict(preproTrain, tempTrain[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))])
# tempTest <- predict(preproTest, tempTest[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))])
contVars <- c("Ret_MinusOne", "Ret_MinusTwo", paste0("Ret_", 2:120), paste0("Feature_", c(2:4,6,7,11,12,14,15,17:19,21:25)))
preproData <- f.normalize.quantiles(tempTrain, tempTest, contVars, quant = 0.01)
tempTrain <- preproData$train
tempTest <- preproData$test
tempTrain$Ret_D0 <- apply(tempTrain[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
tempTest$Ret_D0 <- apply(tempTest[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
for (toCheck in c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Ret_", 2:120), paste0("Feature_", 1:25))) {
  tiff(file.path(rDir, "train_vs_test", paste0("filtered_prePro_train_vs_test_", toCheck, ".tiff")), type = "Xlib", compression = "lzw", width = 500, height = 500, pointsize = 20)
  testQ <- quantile(tempTest[[toCheck]], seq(0, 1, .01), na.rm = TRUE)
  trainQ <- quantile(tempTrain[[toCheck]], seq(0, 1, .01), na.rm = TRUE)
  f.scatter(trainQ, testQ, "train data", "test data", main = toCheck)
  abline(a = 0, b = 1, col = "red")
  dev.off()
}

#########################################################################################
### again an imputation - only with the "validRowsTrain" and "validRowsTest"
# MICE (5 times 10 imputations)
offset <- 0 # pick one offset per run; this block was run five times (offsets 0, 10, 20, 30, 40) for 5 x 10 imputations
# offset <- 10
# offset <- 20
# offset <- 30
# offset <- 40
toRemove <- scan(file.path(rDir, "notInCenter_dbscan.txt"), what = "character")
validRowsTrain <- scan(file.path(rDir, "validRowsTrain.txt"), what = "character")
validRowsTest <- scan(file.path(rDir, "validRowsTest.txt"), what = "character")
invalidRowsTrain <- setdiff(paste0("tr_", 1:4e4), validRowsTrain); length(invalidRowsTrain)
invalidRowsTest <- setdiff(paste0("te_", 1:12e4), validRowsTest); length(invalidRowsTest)
invalidRowsTrain <- setdiff(invalidRowsTrain, toRemove); length(invalidRowsTrain)
validRows <- union(validRowsTrain, validRowsTest); length(validRows)
invalidRows <- union(invalidRowsTrain, invalidRowsTest); length(invalidRows)
for (toFactor in c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20")) { comb[[toFactor]] <- as.factor(comb[[toFactor]]) }
combValid <- comb[validRows,]; nrow(combValid)
combImpAllValid <- mice(combValid, m = 10, seed = 123 + offset) # vary the seed per run - a fixed seed would make all five runs identical
for (i in 1:10) {
  temp <- complete(combImpAllValid, i)
  write.csv(temp, file.path(rDir, "mice_final", paste0("VALID_all_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
  write.csv(temp[validRowsTest,], file.path(rDir, "mice_final", paste0("VALID_test_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
  trainDataWithNA[validRowsTrain, colnames(temp)] <- temp[validRowsTrain,]
  write.csv(trainDataWithNA[validRowsTrain,], file.path(rDir, "mice_final", paste0("VALID_train_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
}
combInvalid <- comb[invalidRows,]; nrow(combInvalid)
combImpAllInvalid <- mice(combInvalid, m = 10, seed = 123 + offset) # vary the seed here as well
for (i in 1:10) {
  temp <- complete(combImpAllInvalid, i)
  write.csv(temp, file.path(rDir, "mice_final", paste0("INVALID_all_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
  write.csv(temp[invalidRowsTest,], file.path(rDir, "mice_final", paste0("INVALID_test_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
  trainDataWithNA[invalidRowsTrain,colnames(temp)] <- temp[invalidRowsTrain,]
  write.csv(trainDataWithNA[invalidRowsTrain,], file.path(rDir, "mice_final", paste0("INVALID_train_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
}

# get also an average imputation from all the mice runs - VALID and INVALID data
library("VIM") # for maxCat
toFactor <- c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20")
for (miceType in c("INVALID")) {
  trainData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_train_mice_imp_all_", 1,".csv")), stringsAsFactors = FALSE, row.names = 1)
  testData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_test_mice_imp_all_", 1,".csv")), stringsAsFactors = FALSE, row.names = 1)
  for (cn in toFactor) { 
    trainData[[cn]] <- as.factor(trainData[[cn]])
    testData[[cn]] <- as.factor(testData[[cn]])
  }
  tempTrain <- array(0, c(nrow(trainData), ncol(trainData), 50), list(rownames(trainData), colnames(trainData), paste0("iter_", 1:50)))
  tempTest <- array(0, c(nrow(testData), ncol(testData), 50), list(rownames(testData), colnames(testData), paste0("iter_", 1:50)))
  for (cn in colnames(trainData)) { tempTrain[rownames(trainData), cn, "iter_1"] <- trainData[[cn]] }
  for (cn in colnames(testData)) { tempTest[rownames(testData), cn, "iter_1"] <- testData[[cn]] }
  for (impRun in 2:50) {
    cat("processing imputation", impRun, "...\n")
    testData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_test_mice_imp_all_", impRun,".csv")), stringsAsFactors = FALSE, row.names = 1)
    trainData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_train_mice_imp_all_", impRun,".csv")), stringsAsFactors = FALSE, row.names = 1)
    for (cn in toFactor) { 
      trainData[[cn]] <- as.factor(trainData[[cn]])
      testData[[cn]] <- as.factor(testData[[cn]])
    }
    for (cn in colnames(trainData)) { tempTrain[rownames(trainData), cn, paste0("iter_", impRun)] <- trainData[[cn]] }
    for (cn in colnames(testData)) { tempTest[rownames(testData), cn, paste0("iter_", impRun)] <- testData[[cn]] }
  }
  quantCols <- setdiff(colnames(testData),toFactor)
  trainData[,toFactor] <- apply(tempTrain[,toFactor,], c(1,2), function(x) maxCat(x))
  trainData[,quantCols] <- apply(tempTrain[,quantCols,], c(1,2), function(x) median(as.numeric(x)))
  testData[,toFactor] <- apply(tempTest[,toFactor,], c(1,2), function(x) maxCat(x))
  testData[,quantCols] <- apply(tempTest[,quantCols,], c(1,2), function(x) median(as.numeric(x)))
  write.csv(testData, file.path(rDir, paste0(miceType, "_test_mice_imp_all_mean.csv")), quote = FALSE)
  write.csv(trainData, file.path(rDir, paste0(miceType, "_train_mice_imp_all_mean.csv")), quote = FALSE)
}

#########################################################################################
### again an imputation - only with the "validRowsTrain" and "validRowsTest"
# kNN with VIM (should solve the categorical problem)
toRemove <- scan(file.path(rDir, "notInCenter_dbscan.txt"), what = "character")
validRowsTrain <- scan(file.path(rDir, "validRowsTrain.txt"), what = "character")
validRowsTest <- scan(file.path(rDir, "validRowsTest.txt"), what = "character")
invalidRowsTrain <- setdiff(paste0("tr_", 1:4e4), validRowsTrain); length(invalidRowsTrain)
invalidRowsTest <- setdiff(paste0("te_", 1:12e4), validRowsTest); length(invalidRowsTest)
invalidRowsTrain <- setdiff(invalidRowsTrain, toRemove); length(invalidRowsTrain)
validRows <- union(validRowsTrain, validRowsTest); length(validRows)
invalidRows <- union(invalidRowsTrain, invalidRowsTest); length(invalidRows)
toFactor <- c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20")
for (cn in toFactor) { comb[[cn]] <- as.factor(comb[[cn]]) }
combValid <- comb[validRows,]; nrow(combValid)
combInvalid <- comb[invalidRows,]; nrow(combInvalid)
knnValidIter <- f.simple.imputation.with.nearest.neighbors.sampled.VIM(combValid, splitInto = 8, numCores = 8)
knnValid <- combValid
knnValid[,toFactor] <- apply(knnValidIter[,toFactor,], c(1,2), function(x) maxCat(x))
knnValid[,setdiff(colnames(knnValid),toFactor)] <- apply(knnValidIter[,setdiff(colnames(knnValid),toFactor),], c(1,2), function(x) median(x))
knnInvalidIter <- f.simple.imputation.with.nearest.neighbors.sampled.VIM(combInvalid, splitInto = 8, numCores = 8)
knnInvalid <- combInvalid
knnInvalid[,toFactor] <- apply(knnInvalidIter[,toFactor,], c(1,2), function(x) maxCat(x))
knnInvalid[,setdiff(colnames(knnInvalid),toFactor)] <- apply(knnInvalidIter[,setdiff(colnames(knnInvalid),toFactor),], c(1,2), function(x) median(x))
for (cn in colnames(knnInvalid)) {
  cat("processing", cn, "...\n")
  trainDataWithNA[validRowsTrain, cn] <- knnValid[validRowsTrain, cn]
  testDataWithNA[validRowsTest, cn] <- knnValid[validRowsTest, cn]
  trainDataWithNA[invalidRowsTrain, cn] <- knnInvalid[invalidRowsTrain, cn]
  testDataWithNA[invalidRowsTest, cn] <- knnInvalid[invalidRowsTest, cn]
}
write.csv(trainDataWithNA[setdiff(rownames(trainDataWithNA), toRemove), ], file.path(rDir, "BOTH_kNN_train.csv"), quote = FALSE)
write.csv(testDataWithNA, file.path(rDir, "BOTH_kNN_test.csv"), quote = FALSE)
write.csv(trainDataWithNA[validRowsTrain, ], file.path(rDir, "VALID_kNN_train.csv"), quote = FALSE)
write.csv(testDataWithNA[validRowsTest, ], file.path(rDir, "VALID_kNN_test.csv"), quote = FALSE)
write.csv(trainDataWithNA[invalidRowsTrain, ], file.path(rDir, "INVALID_kNN_train.csv"), quote = FALSE)
write.csv(testDataWithNA[invalidRowsTest, ], file.path(rDir, "INVALID_kNN_test.csv"), quote = FALSE)

#########################################################################################
### load data (check if there are still some NAs left - no)
rDir <- "/Users/marc/Documents/winton"
rDir <- "/home/ubuntu/Winton"
impType <- "INVALID"
trainDataMICE <- read.csv(file.path(rDir, paste0(impType, "_train_mice_imp_all_mean.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(trainDataMICE)
testDataMICE <- read.csv(file.path(rDir, paste0(impType, "_test_mice_imp_all_mean.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(testDataMICE)
trainDataKNN <- read.csv(file.path(rDir, paste0(impType, "_kNN_train.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(trainDataKNN)
testDataKNN <- read.csv(file.path(rDir, paste0(impType, "_kNN_test.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(testDataKNN)
sum(!complete.cases(trainDataMICE)) # rows with at least one NA - expect 0 everywhere
sum(!complete.cases(testDataMICE))
sum(!complete.cases(trainDataKNN))
sum(!complete.cases(testDataKNN))
toFactor <- c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20") # needed again below

#########################################################################################
### check scatterplots by eye
contVars <- setdiff(colnames(testDataMICE), toFactor)
preproDataMICE <- f.normalize.quantiles(trainDataMICE, testDataMICE, contVars, quant = 0.0001)
preproDataKNN <- f.normalize.quantiles(trainDataKNN, testDataKNN, contVars, quant = 0.0001)
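
# f.normalize.quantiles() is defined earlier; with quant = 1e-4 it presumably
# winsorizes each continuous variable at its extreme quantiles (limits taken
# from the training data) and rescales both sets to [0, 1]. A hypothetical
# sketch of that assumption:
f.normalize.quantiles.sketch <- function(train, test, contVars, quant = 1e-4) {
  for (cn in contVars) {
    lims <- quantile(train[[cn]], c(quant, 1 - quant), na.rm = TRUE)
    if (diff(lims) > 0) {
      train[[cn]] <- (pmin(pmax(train[[cn]], lims[1]), lims[2]) - lims[1]) / diff(lims)
      test[[cn]]  <- (pmin(pmax(test[[cn]],  lims[1]), lims[2]) - lims[1]) / diff(lims)
    }
  }
  list(train = train, test = test)
}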
for (ret in c("Ret_PlusOne", "Ret_PlusTwo")) {
  for (fea in paste0("Feature_", 10:25)) {
    tiff(file.path(rDir, "ret_vs_fea", paste0("validRows_raw_mice_knn_", ret, "_vs_", fea, ".tiff")), type = "Xlib", compression = "lzw", width = 1500, height = 1000, pointsize = 20)
    if (fea %in% paste0("Feature_", c(10, 16))) {
      f.compare.ret.and.fea.no.dens(trainDataMICE, trainDataKNN, ret, fea)
    } else {
      f.compare.ret.and.fea(trainDataMICE, trainDataKNN, ret, fea)
    }
    dev.off()
    tiff(file.path(rDir, "ret_vs_fea", paste0("validRows_quantNorm_mice_knn_", ret, "_vs_", fea, ".tiff")), type = "Xlib", compression = "lzw", width = 1500, height = 1000, pointsize = 20)
    if (fea %in% paste0("Feature_", c(10, 16))) {
      f.compare.ret.and.fea.no.dens(preproDataMICE$train, preproDataKNN$train, ret, fea)
    } else {
      f.compare.ret.and.fea(preproDataMICE$train, preproDataKNN$train, ret, fea)
    }
    dev.off()
  }
}
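
# f.compare.ret.and.fea() and its .no.dens variant are defined earlier; they
# presumably put the MICE and kNN imputations side by side as return-vs-feature
# scatterplots (the .no.dens variant likely uses plain plot() for the
# few-level Features 10 and 16, where a density scatter is pointless).
# A hypothetical sketch of the density version:
f.compare.ret.and.fea.sketch <- function(mice, knn, ret, fea) {
  layout(matrix(1:2, nrow = 1))
  smoothScatter(mice[[fea]], mice[[ret]], xlab = fea, ylab = ret, main = "MICE")
  smoothScatter(knn[[fea]],  knn[[ret]],  xlab = fea, ylab = ret, main = "kNN")
}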

#########################################################################################
### predictions - in total we have now:
# data sets: VALID and INVALID
# imputations: MICE (take mean) and kNN
# normalization: preProcess() and f.normalize.quantiles()
# methods: cubist, ranger, rpart
# this will all run on different VMs...
jobs <- list()
jobs$MICE_preProcess_cubist <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "cubist", evalMet = "boot")
jobs$MICE_preProcess_randomForest <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "rf", evalMet = "boot")
jobs$MICE_preProcess_rpart <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "rpart", evalMet = "repeatedcv")
jobs$MICE_quantNorm_cubist <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "cubist", evalMet = "boot")
jobs$MICE_quantNorm_randomForest <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "rf", evalMet = "boot")
jobs$MICE_quantNorm_rpart <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "rpart", evalMet = "repeatedcv")
jobs$KNN_preProcess_cubist <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "cubist", evalMet = "boot")
jobs$KNN_preProcess_randomForest <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "rf", evalMet = "boot")
jobs$KNN_preProcess_rpart <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = TRUE, useQuantileNormalization = FALSE,  modMet = "rpart", evalMet = "repeatedcv")
jobs$KNN_quantNorm_cubist <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "cubist", evalMet = "boot")
jobs$KNN_quantNorm_randomForest <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "rf", evalMet = "boot")
jobs$KNN_quantNorm_rpart <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = FALSE, useQuantileNormalization = TRUE,  modMet = "rpart", evalMet = "repeatedcv")
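
# purely as a usage note: the twelve jobs above are the full 2 (imputation) x
# 2 (normalization) x 3 (model) grid, so the same list could be generated
# programmatically (same name/config pairs, different order; stored under
# jobsAlt so it does not clobber the explicit list, which is easier to eyeball):
jobGrid <- expand.grid(imp = c("MICE", "KNN"), norm = c("preProcess", "quantNorm"),
                       mod = c("cubist", "rf", "rpart"), stringsAsFactors = FALSE)
jobsAlt <- lapply(seq_len(nrow(jobGrid)), function(i) {
  isMICE <- jobGrid$imp[i] == "MICE"
  list(train = if (isMICE) trainDataMICE else trainDataKNN,
       test  = if (isMICE) testDataMICE  else testDataKNN,
       doPreprocess = jobGrid$norm[i] == "preProcess",
       useQuantileNormalization = jobGrid$norm[i] == "quantNorm",
       modMet = jobGrid$mod[i],
       evalMet = ifelse(jobGrid$mod[i] == "rpart", "repeatedcv", "boot"))
})
names(jobsAlt) <- paste(jobGrid$imp, jobGrid$norm,
                        ifelse(jobGrid$mod == "rf", "randomForest", jobGrid$mod), sep = "_")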

nrow(trainDataMICE)
nrow(testDataMICE)
nrow(trainDataKNN)
nrow(testDataKNN)

#########################################################################################
### predictions - run exactly one of the jobList assignments below per VM
### (each assignment overwrites the previous one)
jobList <- c("MICE_preProcess_cubist")
jobList <- c("MICE_preProcess_randomForest")
jobList <- c("MICE_quantNorm_cubist")
jobList <- c("MICE_quantNorm_randomForest")
jobList <- c("KNN_preProcess_cubist")
jobList <- c("KNN_preProcess_randomForest")
jobList <- c("KNN_quantNorm_cubist")
jobList <- c("KNN_quantNorm_randomForest")
jobList <- grep("^MICE_preProcess", names(jobs), value = TRUE)[1:2]
jobList <- grep("^MICE_quantNorm", names(jobs), value = TRUE)[1:2]
jobList <- grep("^KNN_preProcess_", names(jobs), value = TRUE)[1:2]
jobList <- grep("^KNN_quantNorm_", names(jobs), value = TRUE)[1:2]
jobList <- grep("rpart$", names(jobs), value = TRUE)

for (currentJob in jobList) {
  trainData <- jobs[[currentJob]]$train
  testData <- jobs[[currentJob]]$test
  doPrep <- jobs[[currentJob]]$doPreprocess
  useQua <- jobs[[currentJob]]$useQuantileNormalization
  modMet <- jobs[[currentJob]]$modMet
  evaMet <- jobs[[currentJob]]$evalMet
  toFact <- paste0("Feature_", c(1,5,10,13,16,20))
  cat("doing interday predictions...\n")    
  preparedData <- f.prepare.for.models(trainData, testData, "inter", doPreprocess = doPrep, toFactor = toFact, noInt = TRUE, useQuantileNormalization = useQua)
  scoresInter <- c()
  predictions <- list()
  for (toPredict in c("Ret_PlusOne", "Ret_PlusTwo")) {
    cat("doing", currentJob, toPredict, "...\n")
    tempRes <- f.inter.day.predictions(preparedData$train, preparedData$test, toPredict, preparedData$vars, modMet, evaMet)
    predictions[[toPredict]] <- tempRes$pred
    scoresInter <- c(scoresInter, tempRes$score)
  }
  out <- do.call("cbind", predictions); colnames(out) <- c("Ret_PlusOne", "Ret_PlusTwo"); rownames(out) <- rownames(testData)
  write.table(scoresInter, file.path(rDir, paste0("scores_interday_", impType, "_", currentJob, ".txt")), quote = FALSE, row.names = FALSE, col.names = FALSE)
  write.csv(out, file.path(rDir, paste0("interday_", impType, "_", currentJob, ".csv")), quote = FALSE)
  tiff(file.path(rDir, paste0("predCheck_", impType, "_", currentJob, ".tiff")), width = 600, height = 300)
  layout(matrix(1:2, nrow = 1))
  f.histogram(predictions$Ret_PlusOne)
  f.histogram(predictions$Ret_PlusTwo)
  dev.off()
  cat("doing intraday predictions...\n")    
  preparedData <- f.prepare.for.models(trainData, testData, "intra", doPreprocess = doPrep, toFactor = toFact, noInt = TRUE, useQuantileNormalization = useQua)
  scoresIntra <- c()
  for (timePointToPredict in 121:180) {
    cat("doing", currentJob, paste0("Ret_pred_", timePointToPredict), "...\n")
    preparedData <- f.on.the.fly.intra.day.predictions(preparedData$train, preparedData$test, timePointToPredict, preparedData$vars, memory = 30, win = 6, 
                                                       modMet = modMet, evaMet = evaMet, useQuantileNormalization = useQua)
    scoresIntra <- c(scoresIntra, preparedData$score)
    write.table(preparedData$test[, paste0("Ret_pred_", timePointToPredict)], file.path(rDir, paste0("Ret_pred_", timePointToPredict, "_", impType, "_", currentJob, ".txt")), quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
  write.table(scoresIntra, file.path(rDir, paste0("scores_intraday_", impType, "_", currentJob, ".txt")), quote = FALSE, row.names = FALSE, col.names = FALSE) # filename must match the loader below (no time point in the name)
  write.csv(preparedData$test[, grep("^Ret_pred", colnames(preparedData$test), value = TRUE)], file.path(rDir, paste0("intraday_", impType, "_", currentJob, ".csv")), quote = FALSE)
}
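
# f.prepare.for.models(), f.inter.day.predictions() and
# f.on.the.fly.intra.day.predictions() are defined earlier; given the
# modMet/evalMet pair they presumably wrap caret::train(). The sign convention
# used when merging below (negative score = keep) suggests the score compares
# the model's CV error against a predict-zero baseline; the intraday variant
# presumably does the same per time point, using the last `memory` minutes
# (smoothed over windows of `win`) as predictors. A hypothetical sketch of the
# per-target inter-day fit under those assumptions:
f.inter.day.sketch <- function(train, test, toPredict, vars, modMet, evaMet) {
  require("caret")
  ctrl <- trainControl(method = evaMet, number = 5)
  fit <- train(x = train[, vars], y = train[[toPredict]],
               method = modMet, trControl = ctrl)
  # score relative to the "predict zero return" baseline: negative = model wins
  baselineRMSE <- sqrt(mean(train[[toPredict]]^2))
  list(pred = predict(fit, newdata = test[, vars]),
       score = min(fit$results$RMSE, na.rm = TRUE) - baselineRMSE)
}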

#########################################################################################
### load the predictions and merge them
impType <- "VALID"
jobList <- c("KNN_quantNorm_cubist")
predictions <- list()
for (dataSet in jobList) {
  predictions[[dataSet]] <- matrix(0, nrow = 12e4, ncol = 62, dimnames = list(paste0("te_", 1:12e4), c("Ret_PlusOne", "Ret_PlusTwo", paste0("Ret_pred_", 121:180))))
  for (impType in c("VALID", "INVALID")) {
    for (prefix in c("interday", "intraday")) {
      temp <- read.csv(file.path(rDir, "final_predictions", paste0(prefix, "_", impType, "_", dataSet, ".csv")), row.names = 1)
      for (cn in colnames(temp)) {
        predictions[[dataSet]][rownames(temp), cn] <- temp[,cn]
      }
      scoreVec <- scan(file.path(rDir, "final_predictions", paste0("scores_", prefix, "_", impType, "_", dataSet, ".txt")), what = double())
      if (prefix == "intraday") {
        # scoreVec[i] belongs to Ret_pred_(120 + i), so address the matrix by
        # column name - a plain which() would hit the two interday columns
        cat("discarding", sum(scoreVec > 0), "bad fits\n")
        predictions[[dataSet]][rownames(temp), paste0("Ret_pred_", 120 + which(scoreVec > 0))] <- 0
        cat("discarding", sum(scoreVec < (-.05)), "suspiciously good fits\n")
        predictions[[dataSet]][rownames(temp), paste0("Ret_pred_", 120 + which(scoreVec < (-.05)))] <- 0
      }
    }
  }
}

for (dataSet in jobList) {
  out <- data.frame(
    Id = paste0(rep(1:12e4, each = 62), '_', 1:62),
    Predicted = 0,
    stringsAsFactors = FALSE
  )
  rownames(out) <- out$Id
  for (id in 1:60) {
    ret <- paste0("Ret_pred_", 120+id)
    out[paste0(1:12e4, '_', id), "Predicted"] <- predictions[[dataSet]][,ret]
  }
  out[paste0(1:12e4, '_61'),"Predicted"] <- predictions[[dataSet]][,"Ret_PlusOne"]
  out[paste0(1:12e4, '_62'),"Predicted"] <- predictions[[dataSet]][,"Ret_PlusTwo"]
  write.csv(out, file.path(rDir, "final_predictions", paste0("FIN_", dataSet, "_onlyIntra.csv")), quote = FALSE, row.names = FALSE)
}
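
# added sanity check (not part of the original pipeline): quick shape and
# Id-order test on the assembled submission before upload - the 62 targets per
# window are the 60 intraday returns plus D+1 and D+2
stopifnot(nrow(out) == 12e4 * 62)                          # 120k windows x 62 targets
stopifnot(identical(out$Id[1:3], c("1_1", "1_2", "1_3")))  # Id runs within-window first
stopifnot(!anyNA(out$Predicted))                           # no missing predictions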