In this competition the challenge is to predict the return of a stock, given the history of the past few days.
We provide 5-day windows of time, days D-2, D-1, D, D+1, and D+2. You are given returns in days D-2, D-1, and part of day D, and you are asked to predict the returns in the rest of day D, and in days D+1 and D+2.
During day D, there is intraday return data: the returns at different points within the day. We provide 180 minutes of data, from t=1 to t=180. In the training set you are given the full 180 minutes; in the test set, only the first 120 minutes are provided.
For each 5-day window, we also provide 25 features, Feature_1 to Feature_25. These may or may not be useful in your prediction.
Each row in the dataset is an arbitrary stock in an arbitrary 5-day time window.
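For reference, here is the column layout the script below assumes (reconstructed from the code itself, not copied from the official data page):
# Ret_MinusTwo, Ret_MinusOne - daily returns of days D-2 and D-1 (train + test)
# Ret_2 ... Ret_120 - intraday returns of day D (train + test)
# Ret_121 ... Ret_180 - intraday returns of day D (train only, targets)
# Ret_PlusOne, Ret_PlusTwo - daily returns of days D+1 and D+2 (train only, targets)
# Feature_1 ... Feature_25 - anonymized features
# Weight_Intraday, Weight_Daily - weights used in the evaluation metric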
And apropos the original code… the stuff below is already quite a clean-up :D
Below are the correlations between the past returns in the test data:
Now for the “outlier” training data:
And finally for the cleaned-up training data:
################################################################################################################
### some functions (sorry - had some of them at hand already)
f.yellowredblue <- function(x) {
r <- approx(c(0, 0.5, 1), c(1, 1, 0), n = x)$y
g <- approx(c(0, 0.5, 1), c(1, 0, 0), n = x)$y
b <- approx(c(0, 0.5, 1), c(0, 0, 1), n = x)$y
return(rgb(r, g, b))
}
f.blueredyellow <- function(x) {
return(rev(f.yellowredblue(x)))
}
f.comp <- function(x,rt){
if(rt==0){y <- x>0}
else{y <- x>=rt}
return(y)
}
f.generic.correlation.matrix <- function(data, rDir, outfile, corMethod = "pearson", useOnlyHighVar = FALSE, tryAutoColor = TRUE, highVarPercentage = 0.9, corUse = "everything") {
require("gplots")
if (useOnlyHighVar) {
varVec <- apply(data, 1, var)
lowerBound <- quantile(varVec, highVarPercentage)
data <- data[varVec>=lowerBound,]
}
sampleCorMat <- cor(data, method = corMethod, use = corUse)
# some plotting parameters
minCor <- min(sampleCorMat, na.rm = TRUE)
maxCor <- max(sampleCorMat[sampleCorMat != 1], na.rm = TRUE)
if (sum(is.na(c(minCor, maxCor))) > 0) { tryAutoColor <- FALSE }
if (tryAutoColor) {
if (minCor > 0.8) { minCor <- 0.8 } else { if (minCor > 0.7) { minCor <- 0.7 } else { if (minCor > 0.5) { minCor <- 0.5 }}}
if (maxCor > 0.8) { maxCor <- 1 } else { if (maxCor < 0.8) { maxCor <- 0.8 } else { if (maxCor < 0.7) { maxCor <- 0.7 } else { if (maxCor < 0.5) { maxCor <- 0.5 } }}}
}
numCols <- ceiling((maxCor-minCor)*50)
tiff(file.path(rDir, paste(outfile, "_cor_mat.tiff", sep = '')), width = 1600, height = 1600, compression = "lzw", type = "Xlib")
labelCex <- 1 + 1/log10(nrow(sampleCorMat))
noteCex <- 4
if (nrow(sampleCorMat) > 6) { noteCex <- 3 }
if (nrow(sampleCorMat) > 12) { noteCex <- 2 }
if (nrow(sampleCorMat) > 18) { noteCex <- 1 }
if (nrow(sampleCorMat) > 30) { #no notes
heatmap.2(sampleCorMat, col = f.blueredyellow(numCols), trace="none", scale = "none", margins = c(15,15), breaks = seq(minCor, maxCor, length.out = numCols+1))
} else {
heatmap.2(sampleCorMat, col = f.blueredyellow(numCols), trace="none", scale = "none", margins = c(15,15), cellnote = round(sampleCorMat,2), notecol = "black", notecex = noteCex, breaks = seq(minCor, maxCor, length.out = numCols+1), cexRow = labelCex, cexCol = labelCex)
}
dev.off()
}
f.histogram <- function(x, y = c(), xName = "", useFreq = FALSE, doNotShowSummary = FALSE, useLog = FALSE, ...)
{
# draws a histogram as a curve (frequency or density) rather than bars
# x: values
# xName: x-axis label
# useFreq: will plot frequencies instead of densities
if (length(x) == length(y)) {
toPlot <- list(x = x, y = y)
class(toPlot) <- "density"
} else {
toPlot <- density(x)
}
if (useFreq) {
toPlot$y <- toPlot$y * length(x)
}
if (useLog) {
if (useFreq) { toPlot$y <- log2(toPlot$y+1) }
else { cat("log only possible for useFreq = TRUE\n") }
}
par(mar = c(5,4,1,1))
plot(
toPlot,
#main = "",
bty = "n",
xaxs = "r",
yaxs = "r",
xlab = xName,
ylab = "",
las = 1,
cex = 0.4,
tck = 0.01,
...
)
if (!doNotShowSummary) {
text(
par("usr")[1] + par("usr")[2]/15,
par("usr")[4] - par("usr")[4]/15,
adj = c(0,1),
labels = paste(c("median:", "mean:", "sd:"),
c(round(median(x), digits = 3),
round(mean(x), digits = 3),
round(sd(x), digits = 3)),
collapse = "\n", sep=" ")
)
}
}
f.scatter <- function(x, y, xName = "", yName = "", cexFactor = 1, ...)
{
# this will draw a normal scatterplot
# x: x values
# y: y values
# xName: x-axis label
# yName: y-axis label
par(mar = c(5,5,1,1))
plot(
y ~ x,
#main = "",
bty = "n",
xaxs = "r",
yaxs = "r",
xlab = xName,
ylab = yName,
las = 1,
cex = 0.2*cexFactor,
tck = 0.01,
pch = 19,
...
)
text(
par("usr")[1] + par("usr")[2]/15,
par("usr")[4] - par("usr")[4]/15,
adj = c(0,1),
labels = paste(c("corP:", "corS:", "n:"),
c(round(cor(x,y,method="pearson"), digits = 3),
round(cor(x,y,method="spearman"), digits = 3),
length(x)),
collapse = "\n", sep=" ")
)
}
f.genetodensity <- function(x,y) {
require("MASS")
est <- kde2d(x, y, n = 50) # needs MASS - calculates the two dimensional density
if (sum(is.na(est$z)) > 0) {
est <- kde2d(x, y, n = 50, h = c(bw.nrd0(x), bw.nrd0(y))) # needs MASS - calculates the two dimensional density
}
dvec <- as.vector(est$z) # note: as.vector takes columnwise, rows correspond to the value of x, columns to the value of y - the y index counts therefore 50 times
xind <- findInterval(x, est$x) # find the intervals to which a gene belongs
yind <- findInterval(y, est$y) # find the intervals to which a gene belongs
vecind <- (yind-1)*50 + xind # this is a vector with indices for dvec.
dens <- dvec[vecind]
names(dens) <- names(x)
return(dens)
}
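### toy check of the density lookup (hedged sketch with made-up demo_* data, not part of the original analysis)
### darker points should sit in denser regions
set.seed(1)
demo_x <- rnorm(500); demo_y <- demo_x + rnorm(500)
demo_d <- f.genetodensity(demo_x, demo_y)
plot(demo_x, demo_y, pch = 19, col = grey(1 - demo_d/max(demo_d)))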
f.genetodensitycolor <- function(x,y) {
require("MASS")
require("colorRamps")
## NOTE: would this break if two points had exactly the same density?
## important for a nice picture is that one removes the grid points with no gene (the ones with the lowest density)
grids <- 100
out <- list()
est <- kde2d(x, y, n = grids) # needs MASS - calculates the two dimensional density
if (sum(is.na(est$z)) > 0) {
est <- kde2d(x, y, n = grids, h = c(bw.nrd0(x), bw.nrd0(y))) # needs MASS - calculates the two dimensional density
}
dvec <- as.vector(est$z) # note: as.vector takes columnwise, rows correspond to the value of x, columns to the value of y - the y index counts therefore grids times
xind <- findInterval(x, est$x) # find the intervals to which a gene belongs
yind <- findInterval(y, est$y) # find the intervals to which a gene belongs
vecind <- (yind-1)*grids + xind # this is a vector with indices for dvec.
dens <- dvec[vecind]
dens_with_genes <- unique(dens)
names(dens) <- names(x)
out$dens <- dens
out$denschar <- as.vector(dens, mode = "character")
colgrad <- blue2red(length(dens_with_genes))
#colgrad <- rainbow(length(dens_with_genes), start = 0, end = 1, alpha = 0.8)
#colgrad <- heat.colors(length(dens_with_genes), alpha = 0.8)
#colgrad <- terrain.colors(length(dens_with_genes), alpha = 0.8)
#colgrad <- topo.colors(length(dens_with_genes), alpha = 0.8)
#colgrad <- cm.colors(length(dens_with_genes), alpha = 0.8)
sdens_with_genes <- sort(dens_with_genes, decreasing = FALSE)
names(colgrad) <- sdens_with_genes
out$cols <- colgrad
genecols <- colgrad[as.vector(dens, mode = "character")]
names(genecols) <- names(x)
out$genecols <- genecols
return(out)
}
f.replace.Inf.by.next <- function(x) {
if (sum(abs(x) == Inf, na.rm = TRUE) == 0) {return(x)}
cat("replacing Inf/-Inf with the next highest/lowest value\n")
notNA <- !is.na(x)
minInf <- x == -Inf
pluInf <- x == Inf
nx <- x[notNA&(!minInf)&(!pluInf)]
x[notNA&pluInf] <- max(nx)
x[notNA&minInf] <- min(nx)
return(x)
}
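### quick check with toy numbers: Inf/-Inf are clamped to the most extreme finite values, NAs stay NA
f.replace.Inf.by.next(c(1, Inf, -Inf, NA, 5)) # -> 1 5 1 NA 5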
f.density.scatter <- function(x, y, xName = "", yName = "", main = "", tryToScale = FALSE, ...) {
x <- f.replace.Inf.by.next(x)
y <- f.replace.Inf.by.next(y)
if (tryToScale) {
if (max(x) > 10*median(x)) {
isExtreme <- x > 10*median(x)
cat("limiting data to 10*median(x) - i.e. replacing", sum(isExtreme), "in percent:", mean(isExtreme)*100," extreme values\n")
x[isExtreme] <- 10*median(x)
}
if (max(y) > 10*median(y)) {
isExtreme <- y > 10*median(y)
cat("limiting data to 10*median(y) - i.e. replacing", sum(isExtreme), "in percent:", mean(isExtreme)*100," extreme values\n")
y[isExtreme] <- 10*median(y)
}
}
# xLimits <- range(x)
# if (max(x) > 10*median(x)) {
# cat("limiting plot to a quantile(x, 0.99)\n")
# xLimits[2] <- quantile(x, 0.99)
# }
# yLimits <- range(y)
# if (max(y) > 10*median(y)) {
# cat("limiting plot to a quantile(y, 0.99)\n")
# yLimits[2] <- quantile(y, 0.99)
# }
colorGrad <- f.genetodensitycolor(x, y)
# f.scatter(x, y, xName, yName, cexFactor = 1, col = colorGrad$genecols, main = main, xlim = xLimits, ylim = yLimits)
f.scatter(x, y, xName, yName, cexFactor = 1, col = colorGrad$genecols, main = main, ...)
return(NULL)
}
f.allinone <- function(data, rdir, rt = 0, prefix = "", useSVG = FALSE, onWin = FALSE, scaleToSample = FALSE)
{
# this function draws scatterplots, histograms and sorted scatters
# x <dataframe/matrix>: values, one sample in one columns
# rdir <directory>: folder for the figure
# rt <number>: a threshold
# prefix <string>: a word that will be added to the filename
# useSVG <bool>: tells whether to draw an SVG or a TIFF
# onWin <bool>: use this if you use Windows
hits.union <- rownames(data)[which(apply(f.comp(data,rt),1,sum) > 0)]
data <- data[hits.union,]
# get sorted data and make sure everything is a data frame
s.data <- apply(data,2,sort)
data <- as.data.frame(data)
s.data <- as.data.frame(s.data)
# some variables
ns = ncol(data)
sn = colnames(data)
# use CairoSVG if you are on windows
afname <- paste(prefix,'_allinone_',rt,'_',paste(sn,collapse='_'),sep='')
if (nchar(afname) > 50) { afname <- paste(prefix,'_allinone_',rt,'_many',sep='') }
if (useSVG) {
#print("use: rsvg-convert -a -d 300 -p 300 svgfile > pngfile")
afpath <- file.path(rdir,paste(afname, '.svg', sep = ''))
if (onWin) { CairoSVG(file = afpath) }
else { svg(filename = afpath, onefile = TRUE, bg = FALSE, antialias = "default") }
} else {
afpath <- file.path(rdir,paste(afname, '.tiff', sep = ''))
tiff(file=afpath, width = 600*ns, height = 600*ns, units = "px", pointsize = 20, compression = "lzw", bg = "white", type = "Xlib")
}
# plot everything
layout(mat=matrix(c(1:(ns*ns)), nrow=ns, byrow = TRUE))
cat(paste("plotting", ns, "rows\n"))
cat(paste(c("|", rep(' ', ns),"|\n|"),sep='',collapse=''))
for (i in c(1:ns))
{
for (k in c(1:ns))
{
if (scaleToSample) {
xMin <- floor(min(data[,k]))
xMax <- ceiling(max(data[,k]))
yMin <- floor(min(data[,i]))
yMax <- ceiling(max(data[,i]))
} else {
xMin <- floor(min(data))
xMax <- ceiling(max(data))
yMin <- floor(min(data))
yMax <- ceiling(max(data))
}
if (i == k)
{
# histogram in the diagonal
f.histogram(data[f.comp(data[,k],rt),k], xName = sn[k], xlim = c(xMin, xMax), main = "")
}
else if (i < k)
{
# scatterplot with densities on the top right
if ((length(unique(data[,k][!is.na(data[,k])])) > 2) & (length(unique(data[,i][!is.na(data[,i])])) > 2)) {
colorGrad <- f.genetodensitycolor(data[,k], data[,i])
} else {
colorGrad <- list(genecols = rep("black", nrow(data)))
}
f.scatter(data[,k], data[,i], sn[k], sn[i], cexFactor = 1.5, col = colorGrad$genecols, xlim = c(xMin, xMax), ylim = c(yMin, yMax), main = "")
abline(0,1,col="black")
}
else if (i > k)
{
# sorted scatterplot on the bottom left
f.scatter(s.data[,k], s.data[,i], sn[k], sn[i], cexFactor = 1.5, xlim = c(xMin, xMax), ylim = c(yMin, yMax), main = "")
abline(0,1,col="darkorchid3")
}
}
cat("#")
}
cat('|\n')
dev.off()
if (useSVG) {
system(paste("rsvg-convert -a -d 300 -p 300 ", afpath," > ", afpath, ".png", sep = ''))
#system(paste("rm ", afpath, sep = ''))
}
}
f.compare.ret.and.fea <- function(trainData, testData, ret, fea) {
trainData$abs <- abs(trainData[[ret]])
trainData$sig <- sign(trainData[[ret]])*rnorm(nrow(trainData), .5, .1)
testData$abs <- abs(testData[[ret]])
testData$sig <- sign(testData[[ret]])*rnorm(nrow(testData), .5, .1)
trainData <- trainData[(!is.na(trainData[[ret]])) & (!is.na(trainData[[fea]])),]
testData <- testData[(!is.na(testData[[ret]])) & (!is.na(testData[[fea]])),]
layout(matrix(1:6, nrow = 2, byrow = TRUE))
f.density.scatter(as.numeric(trainData[[fea]]), trainData[[ret]], fea, ret, "return")
f.density.scatter(as.numeric(trainData[[fea]]), trainData$abs, fea, ret, "abs(return)")
f.density.scatter(as.numeric(trainData[[fea]]), trainData$sig, fea, ret, "sign(return)")
f.density.scatter(as.numeric(testData[[fea]]), testData[[ret]], fea, ret, "return")
f.density.scatter(as.numeric(testData[[fea]]), testData$abs, fea, ret, "abs(return)")
f.density.scatter(as.numeric(testData[[fea]]), testData$sig, fea, ret, "sign(return)")
return(NULL)
}
f.compare.ret.and.fea.no.dens <- function(trainData, testData, ret, fea) {
trainData$abs <- abs(trainData[[ret]])
trainData$sig <- sign(trainData[[ret]])*rnorm(nrow(trainData), .5, .1)
testData$abs <- abs(testData[[ret]])
testData$sig <- sign(testData[[ret]])*rnorm(nrow(testData), .5, .1)
trainData <- trainData[(!is.na(trainData[[ret]])) & (!is.na(trainData[[fea]])),]
testData <- testData[(!is.na(testData[[ret]])) & (!is.na(testData[[fea]])),]
layout(matrix(1:6, nrow = 2, byrow = TRUE))
f.scatter(as.numeric(trainData[[fea]]), trainData[[ret]], fea, ret, cexFactor = 1, col = "black", main = "return")
f.scatter(as.numeric(trainData[[fea]]), trainData$abs, fea, ret, cexFactor = 1, col = "black", main = "abs(return)")
f.scatter(as.numeric(trainData[[fea]]), trainData$sig, fea, ret, cexFactor = 1, col = "black", main = "sign(return)")
f.scatter(as.numeric(testData[[fea]]), testData[[ret]], fea, ret, cexFactor = 1, col = "black", main = "return")
f.scatter(as.numeric(testData[[fea]]), testData$abs, fea, ret, cexFactor = 1, col = "black", main = "abs(return)")
f.scatter(as.numeric(testData[[fea]]), testData$sig, fea, ret, cexFactor = 1, col = "black", main = "sign(return)")
return(NULL)
}
f.get.black.list <- function(train, test, featuresToCheck) {
cat("checking features for difference between train and test\n")
checks <- matrix(0, nrow = length(featuresToCheck), ncol = 3, dimnames = list(featuresToCheck, c("est", "pVal", "fdr")))
for (thing in featuresToCheck) {
temp <- t.test(as.numeric(train[[thing]]), as.numeric(test[[thing]]), na.action = na.omit)
checks[thing, "est"] <- temp$statistic
checks[thing, "pVal"] <- temp$p.value
}
checks[, "fdr"] <- p.adjust(checks[, "pVal"], method = "BH") # BH needs the full p-value vector, not one value at a time
out <- rownames(checks)[checks[, "fdr"] < 0.01]
print(checks[out,])
return(out)
}
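### toy check (hedged, demo_* data made up): a feature whose distribution is
### shifted between "train" and "test" should end up on the black list
set.seed(11)
demo_trainD <- data.frame(f1 = rnorm(500), f2 = rnorm(500))
demo_testD <- data.frame(f1 = rnorm(500), f2 = rnorm(500, mean = 0.5))
f.get.black.list(demo_trainD, demo_testD, c("f1", "f2")) # expect "f2"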
f.simple.imputation.with.nearest.neighbors <- function(data, nnq = 0.01, impFun = median) {
require("cluster")
dsm <- NULL
nnm <- NULL
dsmCount <- 0
nnmCount <- 0
while (sum(c(is.null(dsm), is.null(nnm))) > 0) {
if (is.null(dsm)) {
dsmCount <- dsmCount + 1
cat("calculating dissimilarity...", dsmCount, "\n")
dsm <- as.matrix(daisy(data, "gower"))
}
if (is.null(nnm)) {
nnmCount <- nnmCount + 1
cat("searching nearest neighbors...", nnmCount, "\n")
if (nnmCount > 1) { nnq <- nnq + 0.1*nnq }
nnm <- t(apply(dsm, 1, function(x) which(x < quantile(x, nnq)))) # it includes itself, but actually no problem
if (is.list(nnm)) { nnm <- NULL } # unequal neighbor counts per row - retry with a slightly larger quantile
}
}
cat("replacing NAs\n")
for (i in 1:nrow(data)) {
if ((i %% 1000) == 0) { cat('#') }
if ((i %% 10000) == 0) { cat('\n') }
mask <- is.na(data[i,])
if (sum(mask) == 0) { next }
if (sum(mask) == 1) {
data[i, mask] <- impFun(data[nnm[i,],mask], na.rm = TRUE)
} else {
data[i, mask] <- apply(data[nnm[i,],mask], 2, function(x) impFun(x, na.rm = TRUE))
}
}
cat('\n')
return(data)
}
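### toy check (hedged, demo_* data made up): inject a few NAs into correlated
### columns and let the Gower-distance neighbors fill them in
set.seed(3)
demo_imp <- data.frame(a = rnorm(200))
demo_imp$b <- demo_imp$a + rnorm(200, sd = 0.1) # correlated, so neighbors are informative
demo_imp$a[sample(200, 5)] <- NA
demo_done <- f.simple.imputation.with.nearest.neighbors(demo_imp, nnq = 0.05)
sum(is.na(demo_done)) # expect 0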
f.simple.imputation.with.nearest.neighbors.sampled <- function(data, nnq = 0.01, impFun = mean, iter = 10, splitInto = 4, numCores = splitInto) {
require("parallel")
temp <- array(0, c(nrow(data), ncol(data), iter), list(rownames(data), colnames(data), paste0("iter_", 1:iter)))
for (i in 1:iter) {
splitBy <- sample.int(splitInto, nrow(data), replace = TRUE)
subSets <- split(data, splitBy)
subSetsImp <- mclapply(subSets, function(x) f.simple.imputation.with.nearest.neighbors(x, nnq, impFun), mc.cores = numCores)
#subSetsImp <- do.call("rbind", subSetsImp)
#rownames(subSetsImp) <- sapply(rownames(subSetsImp), function(x) unlist(strsplit(x, '.', TRUE))[2])
for (ssi in subSetsImp) {
tt <- as.matrix(ssi)
temp[rownames(tt), colnames(tt), paste0("iter_", i)] <- tt
}
}
out <- apply(temp, c(1,2), function(x) median(as.numeric(x), na.rm = TRUE))
return(out)
}
f.simple.imputation.with.nearest.neighbors.sampled.VIM <- function(data, knn = 6, iter = 10, splitInto = 8, numCores = 4) {
require("parallel")
require("VIM")
out <- array(0, c(nrow(data), ncol(data), iter), list(rownames(data), colnames(data), paste0("iter_", 1:iter)))
for (i in 1:iter) {
temp <- matrix(0, nrow(data), ncol(data), dimnames = list(rownames(data), colnames(data)))
splitBy <- sample.int(splitInto, nrow(data), replace = TRUE)
subSets <- split(data, splitBy)
subSetsImp <- mclapply(subSets, function(x) kNN(x, k = knn, numFun = mean), mc.cores = numCores)
for (ssi in subSetsImp) {
for (cn in colnames(temp)) {
temp[rownames(ssi), cn] <- ssi[, cn]
}
}
out[,,i] <- temp
}
#out <- apply(out, c(1,2), function(x) median(as.numeric(x), na.rm = TRUE))
return(out)
}
f.checkDupliBool <- function(x) {
x <- x[!is.na(x)]
temp <- table(x)
out <- sum(temp > 1) > 0
return(out)
}
f.checkDupli <- function(x) {
x <- x[!is.na(x)]
temp <- table(x)
dupliCount <- sum(temp>1)
return(dupliCount)
}
f.checkDupliVal <- function(x) {
x <- x[!is.na(x)]
temp <- table(x)
dupliCount <- sum(temp>1)
if (dupliCount == 0) { out <- NA}
else if (dupliCount == 1) { out <- as.numeric(names(temp)[temp > 1])}
else { out <- paste(names(temp)[temp > 1], collapse = '|') }
return(out)
}
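### tiny checks for the duplicate helpers (toy numbers)
f.checkDupli(c(1, 1, 2, 3, 3, 3)) # 2 values occur more than once
f.checkDupliVal(c(1, 1, 2, 3, 3, 3)) # "1|3"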
f.filter.handpicked <- function(data) {
trainMask_D0 <- abs(data$Ret_D0) < 4e-4; trainMask_D0[is.na(trainMask_D0)] <- TRUE; print(sum(!trainMask_D0))
trainMask_MO <- abs(data$Ret_MinusOne) < .2; trainMask_MO[is.na(trainMask_MO)] <- TRUE; print(sum(!trainMask_MO))
trainMask_MT <- abs(data$Ret_MinusTwo) < .2; trainMask_MT[is.na(trainMask_MT)] <- TRUE; print(sum(!trainMask_MT))
trainMask_F3 <- data$Feature_3 > -2; trainMask_F3[is.na(trainMask_F3)] <- TRUE; print(sum(!trainMask_F3))
trainMask_F4 <- data$Feature_4 > -4; trainMask_F4[is.na(trainMask_F4)] <- TRUE; print(sum(!trainMask_F4))
trainMask_F17 <- data$Feature_17 < 2; trainMask_F17[is.na(trainMask_F17)] <- TRUE; print(sum(!trainMask_F17))
trainMask_F18 <- data$Feature_18 > -1.3; trainMask_F18[is.na(trainMask_F18)] <- TRUE; print(sum(!trainMask_F18))
trainMask_F22 <- data$Feature_22 > -4.5; trainMask_F22[is.na(trainMask_F22)] <- TRUE; print(sum(!trainMask_F22))
trainMask_F23 <- data$Feature_23 > -2; trainMask_F23[is.na(trainMask_F23)] <- TRUE; print(sum(!trainMask_F23))
trainMask_F25 <- data$Feature_25 < 1; trainMask_F25[is.na(trainMask_F25)] <- TRUE; print(sum(!trainMask_F25))
finalMask <- trainMask_D0 & trainMask_MO & trainMask_MT & trainMask_F3 & trainMask_F4 & trainMask_F22 & trainMask_F17 & trainMask_F18 & trainMask_F23 & trainMask_F25
out <- data[finalMask,]
return(out)
}
f.normalize.quantiles <- function(train, test, colsToNorm, quant = 1e-3) {
for (colToNorm in colsToNorm) {
trainQuants <- sort(quantile(train[[colToNorm]], seq(0, 1, quant), na.rm = TRUE)) # in some rare cases, two quantiles are almost equal but it complains about the sorting
testQuants <- sort(quantile(test[[colToNorm]], seq(0, 1, quant), na.rm = TRUE))
trainMap <- findInterval(train[[colToNorm]], trainQuants)
testMap <- findInterval(test[[colToNorm]], testQuants)
train[[colToNorm]] <- testQuants[trainMap]
test[[colToNorm]] <- testQuants[testMap]
}
out <- list(train = train, test = test)
return(out)
}
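### toy check (hedged, demo_* data made up): after mapping everything onto the test
### quantiles, the train quantiles should track the test quantiles closely
set.seed(7)
demo_tr <- data.frame(v = rnorm(1e4))
demo_te <- data.frame(v = rnorm(1e4, mean = 0.3, sd = 1.5))
demo_nn <- f.normalize.quantiles(demo_tr, demo_te, "v", quant = 0.01)
round(quantile(demo_nn$train$v, c(.25, .5, .75)) - quantile(demo_te$v, c(.25, .5, .75)), 2) # ~0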
f.stahel.trafo <- function(x) {
med <- median(x, na.rm = TRUE)
qua <- quantile(x, 0.25, na.rm = TRUE)
con <- med/((med/qua)^2.9)
out <- log(x + con)
return(out)
}
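### hedged note: this is a "started log", log(x + c), with the constant picked via the
### median/lower-quartile ratio (Stahel's rule of thumb); quick look on skewed toy data:
set.seed(5)
demo_skew <- rexp(1000)^2
layout(matrix(1:2, nrow = 1))
f.histogram(demo_skew, xName = "raw")
f.histogram(f.stahel.trafo(demo_skew), xName = "started log")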
f.get.additional.variables <- function(data, blackList = c(), noInt = FALSE) {
require("caTools")
retCols <- grep("^Ret", colnames(data), value = TRUE)
feaCols <- grep("^Feature", colnames(data), value = TRUE)
weiCols <- grep("^Weight", colnames(data), value = TRUE)
pastRet <- paste0("Ret_", c("MinusTwo", "MinusOne", 2:120))
futuRet <- paste0("Ret_", c(121:180, "PlusOne", "PlusTwo"))
originalCols <- colnames(data)
returnsToSimplify <- paste0("Ret_", 2:120)
if (length(blackList)) {
returnsToSimplify <- setdiff(returnsToSimplify, blackList)
feaCols <- setdiff(feaCols, blackList)
}
data$Ret_mean <- apply(data[,returnsToSimplify], 1, function(x) mean(x, na.rm = TRUE))
data$Ret_median <- apply(data[,returnsToSimplify], 1, function(x) median(x, na.rm = TRUE))
data$Ret_range <- apply(data[,returnsToSimplify], 1, function(x) max(x, na.rm = TRUE)-min(x, na.rm = TRUE))
data$Ret_sumAbs <- apply(data[,returnsToSimplify], 1, function(x) sum(abs(x), na.rm = TRUE))
data$Ret_var <- apply(data[,returnsToSimplify], 1, function(x) var(x, na.rm = TRUE))
data$Ret_pos <- apply(data[,returnsToSimplify], 1, function(x) sum(x > 0, na.rm = TRUE))
data$Ret_sum <- apply(data[,returnsToSimplify], 1, function(x) sum(x, na.rm = TRUE))
data$Ret_sum_scaled <- data$Ret_sum/2*3
data$Ret_past_mean <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) mean(x, na.rm = TRUE))
data$Ret_past_sumAbs <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) sum(abs(x), na.rm = TRUE))
data$Ret_past_range <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) max(x, na.rm = TRUE)-min(x, na.rm = TRUE))
data$Ret_past_var <- apply(cbind(data$Ret_MinusTwo,data$Ret_MinusOne,data$Ret_sum/2*3), 1, function(x) var(x, na.rm = TRUE))
data$Ret_past_trend <- apply(data[,intersect(paste0("Ret_", 2:120), returnsToSimplify)], 1, function(x) sum(x, na.rm = TRUE))
data$Ret_past_mean_trend <- apply(data[,intersect(paste0("Ret_", 2:120), returnsToSimplify)], 1, function(x) mean(x, na.rm = TRUE))
data$Ret_mean_trend_10 <- apply(data[,intersect(paste0("Ret_", 2:10), returnsToSimplify)], 1, function(x) mean(x, na.rm = TRUE))
data$Ret_mean_trend_15 <- apply(data[,intersect(paste0("Ret_", 6:15), returnsToSimplify)], 1, function(x) mean(x, na.rm = TRUE))
for (i in seq(20, 120, by = 5)) {
toSummarize <- intersect(paste0("Ret_", (i-10+1):i), returnsToSimplify)
if (length(toSummarize) > 3) {
data[[paste0("Ret_mean_trend_", i)]] <- apply(data[,], 1, function(x) mean(x, na.rm = TRUE))
}
}
# if ("Ret_121" %in% colnames(data)) {
# data$Ret_mean_trend <- apply(data[,paste0("Ret_", 121:180)], 1, function(x) mean(x, na.rm = TRUE))
# data$Ret_mean_trend_130 <- apply(data[,paste0("Ret_", 121:130)], 1, function(x) mean(x, na.rm = TRUE))
# data$Ret_mean_trend_135 <- apply(data[,paste0("Ret_", 126:135)], 1, function(x) mean(x, na.rm = TRUE))
# for (i in seq(140, 180, by = 5)) {
# data[[paste0("Ret_past_mean_trend_", i)]] <- apply(data[,paste0("Ret_", (i-10+1):i)], 1, function(x) mean(x, na.rm = TRUE))
# }
# }
#
simplifiedReturns <- setdiff(colnames(data), originalCols)
# all feature interactions (pairwise multiplied)
if (!noInt) {
doubleInt <- combs(feaCols, 2)
for (i in 1:nrow(doubleInt)) {
curF <- doubleInt[i,]
data[[paste0("int_", paste(curF, collapse = "X"))]] <- data[[curF[1]]]*data[[curF[2]]]
}
}
# or add the triple feature interactions
# tripleInt <- combs(feaCols, 3)
# for (i in 1:nrow(tripleInt)) {
# curF <- tripleInt[i,]
# data[[paste(curF, collapse = "X")]] <- data[[curF[1]]]*data[[curF[2]]]*data[[curF[3]]]
# }
# and add the interactions of the features with all the past returns paste0(fea, "X", pastRet) >> ctrl-a-0 - also not too splendid (there are too many single past return points)
# for (fea in feaCols) {
# for (pas in pastRet) {
# data[[paste0(fea, "X", pas)]] <- data[[fea]]*data[[pas]]
# }
# }
# the same for the simplified returns >> ctrl-a-1 - not so great - how about only this
# for (fea in feaCols) {
# for (sim in simplifiedReturns) {
# data[[paste0(fea, "X", sim)]] <- data[[fea]]*data[[sim]]
# }
# }
out <- list(
data = data,
simplifiedReturns = simplifiedReturns
)
if (!noInt) {
out$dointCols = apply(doubleInt, 1, function(x) paste0("int_", paste(x, collapse = 'X')))
# out$trintCols = apply(tripleInt, 1, function(x) paste(x, collapse = 'X')),
# out$pasIntCols = paste0(rep(feaCols, each = length(pastRet)), "X", pastRet),
# out$simIntCols = paste0(rep(feaCols, each = length(simplifiedReturns)), "X", simplifiedReturns)
}
return(out)
}
f.prepare.for.models <- function(train, test, type, doPreprocess = TRUE, toFactor = c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20"), addAnyway = c(), blackList = c(), noInt = FALSE, useQuantileNormalization = FALSE) {
require("caret")
#########################################################################################
### some things for data access
retCols <- grep("^Ret", colnames(train), value = TRUE)
feaCols <- grep("^Feature", colnames(train), value = TRUE)
weiCols <- grep("^Weight", colnames(train), value = TRUE)
pastRet <- paste0("Ret_", c("MinusTwo", "MinusOne", 2:120))
futuRet <- paste0("Ret_", c(121:180, "PlusOne", "PlusTwo"))
#########################################################################################
### calculate some additional variables
toFactor <- setdiff(toFactor, blackList)
if (type != "intra") {
cat("calculating additional variables...\n")
trainDataAndVars <- f.get.additional.variables(train, blackList, noInt)
testDataAndVars <- f.get.additional.variables(test, blackList, noInt)
train <- trainDataAndVars$data
test <- testDataAndVars$data
} else {
if (!noInt) {
trainDataAndVars <- f.get.additional.variables(train, blackList, noInt)
testDataAndVars <- f.get.additional.variables(test, blackList, noInt)
train <- trainDataAndVars$data
test <- testDataAndVars$data
}
}
#########################################################################################
### check for correlation with interactions and the regular features as well
# intCols <- c(pastRet, trainDataAndVars$pasIntCols, trainDataAndVars$dointCols, trainDataAndVars$trintCols, trainDataAndVars$simplifiedReturns, trainDataAndVars$simIntCols)
cat("check for correlation with target...\n")
if (type == "inter") {targetVars <- c("Ret_PlusOne", "Ret_PlusTwo")}
else if (type == "intra") {targetVars <- paste0("Ret_", 121:180)}
else {targetVars <- type}
if (type != "intra") {
intCols <- unique(c("Ret_MinusTwo", "Ret_MinusOne", trainDataAndVars$simplifiedReturns))
} else {
intCols <- c("Ret_MinusTwo", "Ret_MinusOne")
}
if (noInt) {
cutOff <- .25
} else {
intCols <- unique(c(intCols, trainDataAndVars$dointCols))
cutOff <- .75
}
intCors <- cor(train[,targetVars], train[, c(intCols, feaCols)], method = "spearman", use = "complete.obs")
if (length(targetVars) == 1) {
corSums <- intCors[1,]
} else {
corSums <- apply(abs(intCors[targetVars,]), 2, sum)
}
corSums <- corSums[setdiff(names(corSums), grep("Ret_[[:digit:]]{1,3}|_trend_", names(corSums), value = TRUE))]
intVars <- names(corSums)[corSums > quantile(corSums, cutOff)]
#########################################################################################
### convert the factors
cat("convert factors...\n")
for (ff in toFactor) {
train[[ff]] <- as.factor(train[[ff]])
test[[ff]] <- as.factor(test[[ff]])
}
#########################################################################################
### check for near zero variance and remove highly correlated features
cat("check near-zero variance and remove highly correlated predictors...\n")
nzv <- nearZeroVar(train[,intVars], saveMetrics= TRUE)
temp <- train[,setdiff(intVars, toFactor)]
descrCor <- cor(temp, use = "complete.obs")
highlyCor <- colnames(temp)[findCorrelation(descrCor, cutoff = .75)]
intVars <- setdiff(intVars, Reduce(union, list(rownames(nzv)[nzv$nzv], highlyCor)))
intVars <- unique(c(intVars, addAnyway))
#########################################################################################
### preprocess data (needs to be repeated for some vars in the intra day models)
if (sum(apply(train, 2, function(x) sum(is.na(x)))) == 0) {
if (doPreprocess) {
cat("preprocessing data...\n")
# comb <- rbind(train[, setdiff(intVars, toFactor)], test[, setdiff(intVars, toFactor)])
# preProComb <- preProcess(comb, method = c("center", "scale", "YeoJohnson"))
# tempComb <- predict(preProComb, comb)
preproTrain <- preProcess(train[, setdiff(intVars, toFactor)], method = c("center", "scale", "YeoJohnson"))
preproTest <- preProcess(test[, setdiff(intVars, toFactor)], method = c("center", "scale", "YeoJohnson"))
tempTrain <- predict(preproTrain, train[, setdiff(intVars, toFactor)])
tempTest <- predict(preproTest, test[, setdiff(intVars, toFactor)])
for (cn in setdiff(intVars, toFactor)) {
# train[[cn]] <- tempComb[rownames(train), cn]
# test[[cn]] <- tempComb[rownames(test), cn]
train[[cn]] <- tempTrain[[cn]]
test[[cn]] <- tempTest[[cn]]
}
}
} else {
cat("skip preprocessing data due to NAs...\n")
}
#########################################################################################
### preprocess with quantile normalization
if (useQuantileNormalization) {
if (doPreprocess) {stop("either use doPreprocess or useQuantileNormalization")}
contVars <- setdiff(colnames(test), toFactor)
preproData <- f.normalize.quantiles(train, test, contVars, quant = 0.0001)
train <- preproData$train
test <- preproData$test
}
#########################################################################################
### return
out <- list(train = train, test = test, vars = intVars)
return(out)
}
f.inter.day.predictions <- function(train, test, toPredict, predVars, modMet = "rpart", evaMet = "adaptive_cv") {
#########################################################################################
# cat("subsetting data...\n")
# inTrain <- createDataPartition(y = train[[toPredict]], p = .5, list = FALSE)
# trainSet <- train[inTrain,]
# testSet <- train[-inTrain,]
#########################################################################################
cat("running model with", length(predVars), "variables...\n")
predVars <- setdiff(predVars, c(toPredict))
numReps <- ifelse(modMet == "rpart", 50, 10)
myCtrl <- trainControl(method = evaMet, number = 10, repeats = numReps)
if (modMet == "ranger") {
formulaString <- paste0(toPredict, '~', paste(predVars, collapse = '+'))
myFit <- train(formula(formulaString), train[,c(toPredict, predVars)], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
} else {
myFit <- train(x = train[,predVars], y = train[[toPredict]], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
}
predValsForScore <- predict(myFit, train)
baseLine <- f.WMAE(train$Weight_Daily, train[[toPredict]], rep(0, nrow(train)))
meanLine <- f.WMAE(train$Weight_Daily, train[[toPredict]], mean(train[[toPredict]], na.rm = TRUE))
diffToNoPred <- f.WMAE(train$Weight_Daily, train[[toPredict]], predValsForScore) - baseLine
diffToNoPredMean <- f.WMAE(train$Weight_Daily, train[[toPredict]], predValsForScore) - meanLine
cat("Difference to zero prediction:", diffToNoPred, "... in percent:", diffToNoPred/baseLine*100, "\n")
cat("Difference to mean prediction:", diffToNoPredMean, "... in percent:", diffToNoPredMean/meanLine*100, "\n")
predValsForSubmission <- predict(myFit, test)
#########################################################################################
out <- list(mod = myFit, pred = predValsForSubmission, score = diffToNoPred/baseLine*100)
return(out)
}
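### hedged usage sketch (hypothetical object names; the real calls are in the job loop further down):
# prep <- f.prepare.for.models(trainData, testData, "inter", doPreprocess = TRUE, noInt = TRUE)
# resP1 <- f.inter.day.predictions(prep$train, prep$test, "Ret_PlusOne", prep$vars, "rpart", "repeatedcv")
# resP1$score # percent difference to the zero prediction (negative is better)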
f.get.additional.variables.intra.test <- function(data, timePointToPredict, memory = 30, win = 10, blackList = c()) {
validVars <- c(paste0("Ret_", 2:120), paste0("Ret_pred_", 121:180))
if (length(blackList)) { validVars <- setdiff(validVars, blackList) }
previous <- (timePointToPredict-memory):(timePointToPredict-1)
previous <- intersect(c(paste0("Ret_", previous), paste0("Ret_pred_", previous)), validVars)
# winNums <- 1:(length(previous)-win)
winNums <- seq(1, length(previous)-win, by = ceiling(win/2))
cat("calculating indicators...\n")
for (i in winNums) {
winToUse <- previous[i:(i+win)]
data[[paste0("win_mean", i)]] <- apply(data[,winToUse], 1, function(x) mean(x, na.rm = TRUE))
data[[paste0("win_sumAbs", i)]] <- apply(data[,winToUse], 1, function(x) sum(abs(x), na.rm = TRUE))
data[[paste0("win_var", i)]] <- apply(data[,winToUse], 1, function(x) sum(abs(x), na.rm = TRUE))
}
return(data)
}
f.on.the.fly.intra.day.predictions <- function(train, test, timePointToPredict, predVars, memory = 30, win = 10, modMet = "rpart", evaMet = "adaptive_cv", blackList = c(), useQuantileNormalization = FALSE) {
#########################################################################################
### calculate some more indicators (sliding windows)
origPredVars <- predVars
cat("calculating sliding windows...\n")
train <- f.get.additional.variables.intra.test(train, timePointToPredict, memory, win, blackList)
test <- f.get.additional.variables.intra.test(test, timePointToPredict, memory, win, blackList)
winVars <- grep("^win_", colnames(train), value = TRUE)
sdev <- apply(train[,winVars], 2, function(x) sd(x, na.rm = TRUE))
winVars <- names(sdev)[sdev > 0]
#########################################################################################
### some preprocessing of the new variables
cat("preprocessing data...\n")
if (useQuantileNormalization) {
preproData <- f.normalize.quantiles(train, test, winVars, quant = 0.0001)
train <- preproData$train
test <- preproData$test
} else {
preproTrain <- preProcess(train[, winVars], method = c("center", "scale", "YeoJohnson"))
preproTest <- preProcess(test[, winVars], method = c("center", "scale", "YeoJohnson"))
train[,winVars] <- predict(preproTrain, train[, winVars])[,winVars]
test[,winVars] <- predict(preproTest, test[, winVars])[,winVars]
}
#########################################################################################
### check for correlation with windows and prefilter
origWinVars <- winVars
cat(length(winVars), "windows with sd > 0\n")
cors <- cor(train[,paste0("Ret_", timePointToPredict)], train[, winVars], method = "spearman")[1,]
winVars <- names(cors)[cors > quantile(cors, .75, na.rm = TRUE)]
#########################################################################################
# cat("subsetting data...\n")
toPredict <- paste0("Ret_", timePointToPredict)
#inTrain <- createDataPartition(y = train[[toPredict]], p = .5, list = FALSE)
#trainSet <- train[inTrain,]
#testSet <- train[-inTrain,]
#########################################################################################
predVars <- unique(c(predVars, winVars))
useAnyway <- paste0(ifelse(timePointToPredict == 121, "Ret_", "Ret_pred_"), timePointToPredict-1) # there is some autocorrelation here
predVars <- unique(c(predVars, useAnyway))
missing <- predVars[!(predVars %in% colnames(train))]
if (length(missing)) {cat(missing, "is not in the data frame\n")}
cat("running model with", length(predVars), "variables...\n")
newVar <- paste0("Ret_pred_", timePointToPredict)
numReps <- ifelse(modMet == "rpart", 50, 10)
myCtrl <- trainControl(method = evaMet, number = 10, repeats = numReps)
colsWithNA <- apply(train[,c(toPredict, predVars)], 2, function(x) sum(is.na(x)))
cat(names(colsWithNA)[colsWithNA > 0], "\n")
if (modMet == "ranger") {
formulaString <- paste0(toPredict, '~', paste(predVars, collapse = '+'))
cat(formulaString, "\n")
myFit <- train(formula(formulaString), train[,c(toPredict, predVars)], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
} else {
myFit <- train(x = train[,predVars], y = train[[toPredict]], method = modMet, tuneLength = 10, trControl = myCtrl, na.action = na.omit)
}
predValsForScore <- predict(myFit, train)
baseLine <- f.WMAE(train$Weight_Intraday, train[[toPredict]], rep(0, nrow(train)))
diffToNoPred <- f.WMAE(train$Weight_Intraday, train[[toPredict]], predValsForScore) - baseLine
cat("Difference to zero prediction:", diffToNoPred, "... in percent:", diffToNoPred/baseLine*100, "\n")
train[[newVar]] <- predict(myFit, train)
test[[newVar]] <- predict(myFit, test)
#########################################################################################
### NOTE: to be safe, remove the window variables again
train <- train[, setdiff(colnames(train), origWinVars)]
test <- test[, setdiff(colnames(test), origWinVars)]
out <- list(train = train, test = test, vars = origPredVars, mod = myFit, score = diffToNoPred/baseLine*100)
return(out)
}
f.WMAE <- function(we, y, yHat) { sum(as.numeric(we)*abs(as.numeric(y)-as.numeric(yHat)))/length(we) } # weighted mean absolute error
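### sanity check with toy numbers: a zero prediction scores the weighted mean of |y|
demo_w <- c(1, 2, 1); demo_y <- c(0.01, -0.02, 0.005)
f.WMAE(demo_w, demo_y, rep(0, 3)) # (1*0.01 + 2*0.02 + 1*0.005)/3 = 0.01833...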
#########################################################################################
### libraries (imputation, modelling, plotting)
library("VIM")
library("mice")
library("randomForest")
library("cluster")
library("ranger")
library("dbscan")
library("rpart")
library("penalizedSVM")
library("mlr")
library("proxy")
library("party")
library("neuralnet")
library("brnn")
library("qrnn")
library("anfis")
library("frbs")
library("rPython")
library("caret")
library("rpart")
library("doMC")
registerDoMC(cores = 32)
#########################################################################################
### data reading and transformation
rDir <- "/home/ubuntu/Winton"
trainDataWithNA <- read.csv(file.path(rDir, "train.csv"), stringsAsFactors = FALSE, row.names = 1)
testDataWithNA <- read.csv(file.path(rDir, "test_2.csv"), stringsAsFactors = FALSE, row.names = 1)
rownames(trainDataWithNA) <- paste0("tr_", 1:nrow(trainDataWithNA))
rownames(testDataWithNA) <- paste0("te_", 1:nrow(testDataWithNA))
for (fea in paste0("Feature_", c(7,9,15))) {
trainDataWithNA[[fea]] <- f.stahel.trafo(trainDataWithNA[[fea]])
testDataWithNA[[fea]] <- f.stahel.trafo(testDataWithNA[[fea]])
}
commonCols <- intersect(colnames(trainDataWithNA), colnames(testDataWithNA))
comb <- rbind(trainDataWithNA[,commonCols], testDataWithNA[,commonCols])
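### quick look at the missingness before any imputation (sketch)
naPerCol <- colSums(is.na(comb))
head(sort(naPerCol, decreasing = TRUE), 10)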
#########################################################################################
### I noticed that the training data does not seem to fit the test data:
# "good models" performed very poorly on the test data, but well on the training data (cross-validation)
# several returns differ significantly between the training data and the test data (t-test)
# many approaches based on the past returns and the features (the data present in both train and test)
# failed miserably. In the end it looks like the future returns (which also have no NAs) separate the training
# data nicely into two large groups - so cluster on the future returns and keep the group where the model fit was always good.
### try dbscan - here one could use the randomForest proximity?
futureReturns <- paste0("Ret_", c("PlusOne", "PlusTwo", 121:180))
kNNdistplot(trainDataWithNA[,futureReturns])
dbsCl <- dbscan(trainDataWithNA[,futureReturns], .02, length(futureReturns)+1)
table(dbsCl$cluster)
rowsToIgnore <- rownames(trainDataWithNA)[dbsCl$cluster != 1]
write.table(rowsToIgnore, file.path(rDir, "notInCenter_dbscan.txt"), row.names = FALSE, col.names = FALSE, quote = FALSE)
formulaString <- paste0('~', paste(futureReturns, collapse = '+'))
myPCA <- princomp(formula(formulaString), trainDataWithNA[, futureReturns], na.action = na.omit)
plot(myPCA)
trafoData <- as.matrix(trainDataWithNA[, futureReturns])%*%as.matrix(myPCA$loadings[,c("Comp.1", "Comp.2")])
numToCol <- c("black", "firebrick2", "dodgerblue2", "darkorchid2", "darkgoldenrod2")
plot(trafoData[,"Comp.1"], trafoData[,"Comp.2"], pch = 16, col = numToCol[dbsCl$cluster+1])
f.density.scatter(trafoData[,"Comp.1"], trafoData[,"Comp.2"])
#########################################################################################
### have a closer look at the dbscan result - the two groups are really different
# after removing the "notInCenter" rows, Ret_PlusOne and Ret_PlusTwo have very good fits based on Feature_7 alone
# but of course - the online submissions are still really trashy
# really awkward: the correlation between Ret_PlusTwo and Feature_7 is also really good in the "notInCenter" rows
toRemove <- scan(file.path(rDir, "notInCenter_dbscan.txt"), what = "character")
validRows <- setdiff(rownames(trainDataWithNA), toRemove)
f.density.scatter(trainDataWithNA[validRows, "Feature_7"], trainDataWithNA[validRows, "Ret_PlusTwo"])
f.density.scatter(trainDataWithNA[toRemove, "Feature_7"], trainDataWithNA[toRemove, "Ret_PlusTwo"])
f.density.scatter(trainDataWithNA[validRows, "Feature_7"], abs(trainDataWithNA[validRows, "Ret_PlusOne"]))
f.density.scatter(trainDataWithNA[validRows, "Feature_7"], abs(trainDataWithNA[validRows, "Ret_PlusTwo"]))
# also interesting, the correlations between the past returns are really different between the two sets
retCols <- grep("^Ret", colnames(trainDataWithNA), value = TRUE)
feaCols <- grep("^Feature", colnames(trainDataWithNA), value = TRUE)
pastRet <- paste0("Ret_", c("MinusTwo", "MinusOne", 2:120))
f.generic.correlation.matrix(trainDataWithNA[validRows,feaCols], rDir, "isCenter", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[toRemove,feaCols], rDir, "notCenter", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[validRows,pastRet], rDir, "isCenter_pastRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[toRemove,pastRet], rDir, "notCenter_pastRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[validRows,retCols], rDir, "isCenter_pastAndFutRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(trainDataWithNA[toRemove,retCols], rDir, "notCenter_pastAndFutRet", "spearman", corUse = "pairwise.complete.obs")
f.generic.correlation.matrix(testDataWithNA[,pastRet], rDir, "test_pastRet", "spearman", corUse = "pairwise.complete.obs")
# this did not change much though -.- try to solve it with normalization
blackListBefore <- f.get.black.list(trainDataWithNA, testDataWithNA, colnames(testDataWithNA))
blackListAfter <- f.get.black.list(trainDataWithNA[setdiff(rownames(trainDataWithNA), toRemove),], testDataWithNA, colnames(testDataWithNA))
print(length(intersect(blackListBefore, blackListAfter))/length(union(blackListBefore, blackListAfter)))
#########################################################################################
### compare features and some past returns between train and test data - eventually use quantile normalization?
tempTrain <- trainDataWithNA[validRows,]
tempTest <- testDataWithNA
tempTrain$Ret_D0 <- apply(tempTrain[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
tempTest$Ret_D0 <- apply(tempTest[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
for (toCheck in c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Ret_", 2:120), paste0("Feature_", 1:25))) {
tiff(file.path(rDir, "train_vs_test", paste0("train_vs_test_", toCheck, ".tiff")), type = "Xlib", compression = "lzw", width = 500, height = 500, pointsize = 20)
testQ <- quantile(tempTest[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
trainQ <- quantile(tempTrain[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
f.scatter(trainQ, testQ, "train data", "test data", main = toCheck)
abline(a = 0, b = 1, col = "red")
if (toCheck %in% paste0("Ret_", 2:120)) {
abline(h = 0.008, v= 0.008, col = "green")
abline(h = -0.008, v = -0.008, col = "green")
}
dev.off()
}
# categorical: 1, 5, 9, 10, 13, 20
# normal: 2, 3, 6, 7, 15, 21, 24
# extreme case but fitting: 4
# single outlier: 11, 19, 22
# bent end: 14
# knee/bump: 17, 18, 23, 25
# not ok at all: 8, 16 (factors with few levels)
# D0: abs() < 4e-4
# Ret_MinusOne/Two: abs() < 0.2
tempTrain <- f.filter.handpicked(tempTrain); nrow(tempTrain)
tempTest <- f.filter.handpicked(tempTest); nrow(tempTest)
tempTrain <- tempTrain[apply(tempTrain[,paste0("Ret_", 2:120)], 1, function(x) sum(abs(x) > .006, na.rm = TRUE)) == 0, ]; nrow(tempTrain)
tempTest <- tempTest[apply(tempTest[,paste0("Ret_", 2:120)], 1, function(x) sum(abs(x) > .006, na.rm = TRUE)) == 0, ]; nrow(tempTest)
for (toCheck in c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Ret_", 2:120), paste0("Feature_", 1:25))) {
tiff(file.path(rDir, "train_vs_test", paste0("filtered_train_vs_test_", toCheck, ".tiff")), type = "Xlib", compression = "lzw", width = 500, height = 500, pointsize = 20)
testQ <- quantile(tempTest[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
trainQ <- quantile(tempTrain[[toCheck]], seq(0, 1, .001), na.rm = TRUE)
f.scatter(trainQ, testQ, "train data", "test data", main = toCheck)
abline(a = 0, b = 1, col = "red")
dev.off()
}
validRowsTrain <- rownames(tempTrain)
validRowsTest <- rownames(tempTest)
write.table(validRowsTrain, file.path(rDir, "validRowsTrain.txt"), quote = FALSE, col.names = FALSE, row.names = FALSE) # 33103 / 32341
write.table(validRowsTest, file.path(rDir, "validRowsTest.txt"), quote = FALSE, col.names = FALSE, row.names = FALSE) # 101881 / 98764
# preproTrain <- preProcess(tempTrain[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))], method = c("center", "scale", "YeoJohnson"))
# preproTest <- preProcess(tempTest[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))], method = c("center", "scale", "YeoJohnson"))
# tempTrain <- predict(preproTrain, tempTrain[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))])
# tempTest <- predict(preproTest, tempTest[, c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Feature_", 1:25))])
contVars <- c("Ret_MinusOne", "Ret_MinusTwo", paste0("Ret_", 2:120), paste0("Feature_", c(2:4,6,7,11,12,14,15,17:19,21:25)))
preproData <- f.normalize.quantiles(tempTrain, tempTest, contVars, quant = 0.01)
tempTrain <- preproData$train
tempTest <- preproData$test
tempTrain$Ret_D0 <- apply(tempTrain[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
tempTest$Ret_D0 <- apply(tempTest[, paste0("Ret_", 2:120)], 1, function(x) mean(x, na.rm = TRUE)) # mean to deal better with the NAs
for (toCheck in c("Ret_MinusOne", "Ret_MinusTwo", "Ret_D0", paste0("Ret_", 2:120), paste0("Feature_", 1:25))) {
tiff(file.path(rDir, "train_vs_test", paste0("filtered_prePro_train_vs_test_", toCheck, ".tiff")), type = "Xlib", compression = "lzw", width = 500, height = 500, pointsize = 20)
testQ <- quantile(tempTest[[toCheck]], seq(0, 1, .01), na.rm = TRUE)
trainQ <- quantile(tempTrain[[toCheck]], seq(0, 1, .01), na.rm = TRUE)
f.scatter(trainQ, testQ, "train data", "test data", main = toCheck)
abline(a = 0, b = 1, col = "red")
dev.off()
}
#########################################################################################
### again an imputation - only with the "validRowsTrain" and "validRowsTest"
# MICE (5 runs of 10 imputations each - set ONE offset per run/VM)
offset <- 0
offset <- 10
offset <- 20
offset <- 30
offset <- 40
toRemove <- scan(file.path(rDir, "notInCenter_dbscan.txt"), what = "character")
validRowsTrain <- scan(file.path(rDir, "validRowsTrain.txt"), what = "character")
validRowsTest <- scan(file.path(rDir, "validRowsTest.txt"), what = "character")
invalidRowsTrain <- setdiff(paste0("tr_", 1:4e4), validRowsTrain); length(invalidRowsTrain)
invalidRowsTest <- setdiff(paste0("te_", 1:12e4), validRowsTest); length(invalidRowsTest)
invalidRowsTrain <- setdiff(invalidRowsTrain, toRemove); length(invalidRowsTrain)
validRows <- union(validRowsTrain, validRowsTest); length(validRows)
invalidRows <- union(invalidRowsTrain, invalidRowsTest); length(invalidRows)
for (toFactor in c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20")) { comb[[toFactor]] <- as.factor(comb[[toFactor]]) }
combValid <- comb[validRows,]; nrow(combValid)
combImpAllValid <- mice(combValid, m = 10, seed = 123)
for (i in 1:10) {
temp <- complete(combImpAllValid, i)
write.csv(temp, file.path(rDir, "mice_final", paste0("VALID_all_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
write.csv(temp[validRowsTest,], file.path(rDir, "mice_final", paste0("VALID_test_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
trainDataWithNA[validRowsTrain, colnames(temp)] <- temp[validRowsTrain,]
write.csv(trainDataWithNA[validRowsTrain,], file.path(rDir, "mice_final", paste0("VALID_train_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
}
combInvalid <- comb[invalidRows,]; nrow(combInvalid)
combImpAllInvalid <- mice(combInvalid, m = 10, seed = 123)
for (i in 1:10) {
temp <- complete(combImpAllInvalid, i)
write.csv(temp, file.path(rDir, "mice_final", paste0("INVALID_all_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
write.csv(temp[invalidRowsTest,], file.path(rDir, "mice_final", paste0("INVALID_test_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
trainDataWithNA[invalidRowsTrain,colnames(temp)] <- temp[invalidRowsTrain,]
write.csv(trainDataWithNA[invalidRowsTrain,], file.path(rDir, "mice_final", paste0("INVALID_train_mice_imp_all_", offset+i, ".csv")), quote = FALSE)
}
# get also an average imputation from all the mice runs - VALID and INVALID data
library("VIM") # for maxCat
toFactor <- c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20")
for (miceType in c("INVALID")) {
trainData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_train_mice_imp_all_", 1,".csv")), stringsAsFactors = FALSE, row.names = 1)
testData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_test_mice_imp_all_", 1,".csv")), stringsAsFactors = FALSE, row.names = 1)
for (cn in toFactor) {
trainData[[cn]] <- as.factor(trainData[[cn]])
testData[[cn]] <- as.factor(testData[[cn]])
}
tempTrain <- array(0, c(nrow(trainData), ncol(trainData), 50), list(rownames(trainData), colnames(trainData), paste0("iter_", 1:50)))
tempTest <- array(0, c(nrow(testData), ncol(testData), 50), list(rownames(testData), colnames(testData), paste0("iter_", 1:50)))
for (cn in colnames(trainData)) { tempTrain[rownames(trainData), cn, "iter_1"] <- trainData[[cn]] }
for (cn in colnames(testData)) { tempTest[rownames(testData), cn, "iter_1"] <- testData[[cn]] }
for (impRun in 2:50) {
cat("processing imputation", impRun, "...\n")
testData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_test_mice_imp_all_", impRun,".csv")), stringsAsFactors = FALSE, row.names = 1)
trainData <- read.csv(file.path(rDir, "mice_final", paste0(miceType, "_train_mice_imp_all_", impRun,".csv")), stringsAsFactors = FALSE, row.names = 1)
for (cn in toFactor) {
trainData[[cn]] <- as.factor(trainData[[cn]])
testData[[cn]] <- as.factor(testData[[cn]])
}
for (cn in colnames(trainData)) { tempTrain[rownames(trainData), cn, paste0("iter_", impRun)] <- trainData[[cn]] }
for (cn in colnames(testData)) { tempTest[rownames(testData), cn, paste0("iter_", impRun)] <- testData[[cn]] }
}
quantCols <- setdiff(colnames(testData),toFactor)
trainData[,toFactor] <- apply(tempTrain[,toFactor,], c(1,2), function(x) maxCat(x))
trainData[,quantCols] <- apply(tempTrain[,quantCols,], c(1,2), function(x) median(as.numeric(x)))
testData[,toFactor] <- apply(tempTest[,toFactor,], c(1,2), function(x) maxCat(x))
testData[,quantCols] <- apply(tempTest[,quantCols,], c(1,2), function(x) median(as.numeric(x)))
write.csv(testData, file.path(rDir, paste0(miceType, "_test_mice_imp_all_mean.csv")), quote = FALSE)
write.csv(trainData, file.path(rDir, paste0(miceType, "_train_mice_imp_all_mean.csv")), quote = FALSE)
}
#########################################################################################
### again an imputation - only with the "validRowsTrain" and "validRowsTest"
# kNN with VIM (should solve the categorical problem)
toRemove <- scan(file.path(rDir, "notInCenter_dbscan.txt"), what = "character")
validRowsTrain <- scan(file.path(rDir, "validRowsTrain.txt"), what = "character")
validRowsTest <- scan(file.path(rDir, "validRowsTest.txt"), what = "character")
invalidRowsTrain <- setdiff(paste0("tr_", 1:4e4), validRowsTrain); length(invalidRowsTrain)
invalidRowsTest <- setdiff(paste0("te_", 1:12e4), validRowsTest); length(invalidRowsTest)
invalidRowsTrain <- setdiff(invalidRowsTrain, toRemove); length(invalidRowsTrain)
validRows <- union(validRowsTrain, validRowsTest); length(validRows)
invalidRows <- union(invalidRowsTrain, invalidRowsTest); length(invalidRows)
toFactor <- c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20")
for (cn in toFactor) { comb[[cn]] <- as.factor(comb[[cn]]) }
combValid <- comb[validRows,]; nrow(combValid)
combInvalid <- comb[invalidRows,]; nrow(combInvalid)
knnValidIter <- f.simple.imputation.with.nearest.neighbors.sampled.VIM(combValid, splitInto = 8, numCores = 8)
knnValid <- combValid
knnValid[,toFactor] <- apply(knnValidIter[,toFactor,], c(1,2), function(x) maxCat(x))
knnValid[,setdiff(colnames(knnValid),toFactor)] <- apply(knnValidIter[,setdiff(colnames(knnValid),toFactor),], c(1,2), function(x) median(x))
knnInvalidIter <- f.simple.imputation.with.nearest.neighbors.sampled.VIM(combInvalid, splitInto = 8, numCores = 8)
knnInvalid <- combInvalid
knnInvalid[,toFactor] <- apply(knnInvalidIter[,toFactor,], c(1,2), function(x) maxCat(x))
knnInvalid[,setdiff(colnames(knnInvalid),toFactor)] <- apply(knnInvalidIter[,setdiff(colnames(knnInvalid),toFactor),], c(1,2), function(x) median(x))
for (cn in colnames(knnInvalid)) {
cat("processing", cn, "...\n")
trainDataWithNA[validRowsTrain, cn] <- knnValid[validRowsTrain, cn]
testDataWithNA[validRowsTest, cn] <- knnValid[validRowsTest, cn]
trainDataWithNA[invalidRowsTrain, cn] <- knnInvalid[invalidRowsTrain, cn]
testDataWithNA[invalidRowsTest, cn] <- knnInvalid[invalidRowsTest, cn]
}
write.csv(trainDataWithNA[setdiff(rownames(trainDataWithNA), toRemove), ], file.path(rDir, "BOTH_kNN_train.csv"), quote = FALSE)
write.csv(testDataWithNA, file.path(rDir, "BOTH_kNN_test.csv"), quote = FALSE)
write.csv(trainDataWithNA[validRowsTrain, ], file.path(rDir, "VALID_kNN_train.csv"), quote = FALSE)
write.csv(testDataWithNA[validRowsTest, ], file.path(rDir, "VALID_kNN_test.csv"), quote = FALSE)
write.csv(trainDataWithNA[invalidRowsTrain, ], file.path(rDir, "INVALID_kNN_train.csv"), quote = FALSE)
write.csv(testDataWithNA[invalidRowsTest, ], file.path(rDir, "INVALID_kNN_test.csv"), quote = FALSE)
#########################################################################################
### load data (check if there are still some NAs left - no)
rDir <- "/Users/marc/Documents/winton"
rDir <- "/home/ubuntu/Winton"
impType <- "INVALID"
trainDataMICE <- read.csv(file.path(rDir, paste0(impType, "_train_mice_imp_all_mean.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(trainDataMICE)
testDataMICE <- read.csv(file.path(rDir, paste0(impType, "_test_mice_imp_all_mean.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(testDataMICE)
trainDataKNN <- read.csv(file.path(rDir, paste0(impType, "_kNN_train.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(trainDataKNN)
testDataKNN <- read.csv(file.path(rDir, paste0(impType, "_kNN_test.csv")), stringsAsFactors = FALSE, row.names = 1); nrow(testDataKNN)
sum(apply(trainDataMICE, 1, function(x) sum(is.na(x)) > 0))
sum(apply(testDataMICE, 1, function(x) sum(is.na(x)) > 0))
sum(apply(trainDataKNN, 1, function(x) sum(is.na(x)) > 0))
sum(apply(testDataKNN, 1, function(x) sum(is.na(x)) > 0))
toFactor <- c("Feature_1", "Feature_5", "Feature_10", "Feature_13", "Feature_16", "Feature_20") # needed for contVars below
#########################################################################################
### check scatterplots by eye
contVars <- setdiff(colnames(testDataMICE), toFactor)
preproDataMICE <- f.normalize.quantiles(trainDataMICE, testDataMICE, contVars, quant = 0.0001)
preproDataKNN <- f.normalize.quantiles(trainDataKNN, testDataKNN, contVars, quant = 0.0001)
for (ret in c("Ret_PlusOne", "Ret_PlusTwo")) {
for (fea in paste0("Feature_", 10:25)) {
tiff(file.path(rDir, "ret_vs_fea", paste0("validRows_raw_mice_knn_", ret, "_vs_", fea, ".tiff")), type = "Xlib", compression = "lzw", width = 1500, height = 1000, pointsize = 20)
if (fea %in% paste0("Feature_", c(10, 16))) {
f.compare.ret.and.fea.no.dens(trainDataMICE, trainDataKNN, ret, fea)
} else{
f.compare.ret.and.fea(trainDataMICE, trainDataKNN, ret, fea)
}
dev.off()
tiff(file.path(rDir, "ret_vs_fea", paste0("validRows_quantNorm_mice_knn_", ret, "_vs_", fea, ".tiff")), type = "Xlib", compression = "lzw", width = 1500, height = 1000, pointsize = 20)
if (fea %in% paste0("Feature_", c(10, 16))) {
f.compare.ret.and.fea.no.dens(preproDataMICE$train, preproDataKNN$train, ret, fea)
} else{
f.compare.ret.and.fea(preproDataMICE$train, preproDataKNN$train, ret, fea)
}
dev.off()
}
}
#########################################################################################
### predictions - in total we have now:
# data sets: VALID and INVALID
# imputations: MICE (take mean) and kNN
# normalization: preProcess() and f.normalize.quantiles()
# methods: cubist, ranger, rpart
# this will all run on different VMs...
jobs <- list()
jobs$MICE_preProcess_cubist <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = TRUE, useQuantileNormalization = FALSE, modMet = "cubist", evalMet = "boot")
jobs$MICE_preProcess_randomForest <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = TRUE, useQuantileNormalization = FALSE, modMet = "rf", evalMet = "boot")
jobs$MICE_preProcess_rpart <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = TRUE, useQuantileNormalization = FALSE, modMet = "rpart", evalMet = "repeatedcv")
jobs$MICE_quantNorm_cubist <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = FALSE, useQuantileNormalization = TRUE, modMet = "cubist", evalMet = "boot")
jobs$MICE_quantNorm_randomForest <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = FALSE, useQuantileNormalization = TRUE, modMet = "rf", evalMet = "boot")
jobs$MICE_quantNorm_rpart <- list(train = trainDataMICE, test = testDataMICE, doPreprocess = FALSE, useQuantileNormalization = TRUE, modMet = "rpart", evalMet = "repeatedcv")
jobs$KNN_preProcess_cubist <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = TRUE, useQuantileNormalization = FALSE, modMet = "cubist", evalMet = "boot")
jobs$KNN_preProcess_randomForest <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = TRUE, useQuantileNormalization = FALSE, modMet = "rf", evalMet = "boot")
jobs$KNN_preProcess_rpart <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = TRUE, useQuantileNormalization = FALSE, modMet = "rpart", evalMet = "repeatedcv")
jobs$KNN_quantNorm_cubist <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = FALSE, useQuantileNormalization = TRUE, modMet = "cubist", evalMet = "boot")
jobs$KNN_quantNorm_randomForest <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = FALSE, useQuantileNormalization = TRUE, modMet = "rf", evalMet = "boot")
jobs$KNN_quantNorm_rpart <- list(train = trainDataKNN, test = testDataKNN, doPreprocess = FALSE, useQuantileNormalization = TRUE, modMet = "rpart", evalMet = "repeatedcv")
nrow(trainDataMICE)
nrow(testDataMICE)
nrow(trainDataKNN)
nrow(testDataKNN)
#########################################################################################
### predictions - pick ONE of the jobList lines below per VM/run
jobList <- c("MICE_preProcess_cubist")
jobList <- c("MICE_preProcess_randomForest")
jobList <- c("MICE_quantNorm_cubist")
jobList <- c("MICE_quantNorm_randomForest")
jobList <- c("KNN_preProcess_cubist")
jobList <- c("KNN_preProcess_randomForest")
jobList <- c("KNN_quantNorm_cubist")
jobList <- c("KNN_quantNorm_randomForest")
jobList <- grep("^MICE_preProcess", names(jobs), value = TRUE)[1:2]
jobList <- grep("^MICE_quantNorm", names(jobs), value = TRUE)[1:2]
jobList <- grep("^KNN_preProcess_", names(jobs), value = TRUE)[1:2]
jobList <- grep("^KNN_quantNorm_", names(jobs), value = TRUE)[1:2]
jobList <- grep("rpart$", names(jobs), value = TRUE)
for (currentJob in jobList) {
trainData <- jobs[[currentJob]]$train
testData <- jobs[[currentJob]]$test
doPrep <- jobs[[currentJob]]$doPreprocess
useQua <- jobs[[currentJob]]$useQuantileNormalization
modMet <- jobs[[currentJob]]$modMet
evaMet <- jobs[[currentJob]]$evalMet
toFact <- paste0("Feature_", c(1,5,10,13,16,20))
cat("doing interday predictions...\n")
preparedData <- f.prepare.for.models(trainData, testData, "inter", doPreprocess = doPrep, toFactor = toFact, noInt = TRUE, useQuantileNormalization = useQua)
scoresInter <- c()
predictions <- list()
for (toPredict in c("Ret_PlusOne", "Ret_PlusTwo")) {
cat("doing", currentJob, toPredict, "...\n")
tempRes <- f.inter.day.predictions(preparedData$train, preparedData$test, toPredict, preparedData$vars, modMet, evaMet)
predictions[[toPredict]] <- tempRes$pred
scoresInter <- c(scoresInter, tempRes$score)
}
out <- do.call("cbind", predictions); colnames(out) <- c("Ret_PlusOne", "Ret_PlusTwo"); rownames(out) <- rownames(testData)
write.table(scoresInter, file.path(rDir, paste0("scores_interday_", impType, "_", currentJob, ".txt")), quote = FALSE, row.names = FALSE, col.names = FALSE)
write.csv(out, file.path(rDir, paste0("interday_", impType, "_", currentJob, ".csv")), quote = FALSE)
tiff(file.path(rDir, paste0("predCheck_", impType, "_", currentJob, ".tiff")), width = 600, height = 300)
layout(matrix(1:2, nrow = 1))
f.histogram(predictions$Ret_PlusOne)
f.histogram(predictions$Ret_PlusTwo)
dev.off()
cat("doing intraday predictions...\n")
preparedData <- f.prepare.for.models(trainData, testData, "intra", doPreprocess = doPrep, toFactor = toFact, noInt = TRUE, useQuantileNormalization = useQua)
scoresIntra <- c()
for (timePointToPredict in 121:180) {
cat("doing", currentJob, paste0("Ret_pred_", timePointToPredict), "...\n")
preparedData <- f.on.the.fly.intra.day.predictions(preparedData$train, preparedData$test, timePointToPredict, preparedData$vars, memory = 30, win = 6,
modMet = modMet, evaMet = evaMet, useQuantileNormalization = useQua)
scoresIntra <- c(scoresIntra, preparedData$score)
write.table(preparedData$test[, paste0("Ret_pred_", timePointToPredict)], file.path(rDir, paste0("Ret_pred_", timePointToPredict, "_", impType, "_", currentJob, ".txt")), quote = FALSE, row.names = FALSE, col.names = FALSE)
}
write.table(scoresIntra, file.path(rDir, paste0("scores_intraday_", impType, "_", currentJob, ".txt")), quote = FALSE, row.names = FALSE, col.names = FALSE) # no timePointToPredict in the name - the merge step below expects scores_intraday_<impType>_<job>.txt
write.csv(preparedData$test[, grep("^Ret_pred", colnames(preparedData$test), value = TRUE)], file.path(rDir, paste0("intraday_", impType, "_", currentJob, ".csv")), quote = FALSE)
}
#########################################################################################
### load the predictions and merge them
impType <- "VALID"
jobList <- c("KNN_quantNorm_cubist")
predictions <- list()
scores <- list()
for (dataSet in jobList) {
predictions[[dataSet]] <- matrix(0, nrow = 12e4, ncol = 62, dimnames = list(paste0("te_", 1:12e4), c("Ret_PlusOne", "Ret_PlusTwo", paste0("Ret_pred_", 121:180))))
for (impType in c("VALID", "INVALID")) {
for (prefix in c("interday", "intraday")) {
temp <- read.csv(file.path(rDir, "final_predictions", paste0(prefix, "_", impType, "_", dataSet, ".csv")), row.names = 1)
for (cn in colnames(temp)) {
predictions[[dataSet]][rownames(temp), cn] <- temp[,cn]
}
scores <- as.numeric(scan(file.path(rDir, "final_predictions", paste0("scores_", prefix, "_", impType, "_", dataSet, ".txt")), what = "double"))
if (prefix == "intraday") {
cat("discarding", sum(scores > 0), "bad fits\n")
predictions[[dataSet]][rownames(temp), which(scores > 0)] <- 0
cat("discarding", sum(scores < (-.05)), "too good fits\n")
predictions[[dataSet]][rownames(temp), which(scores < (-.05))] <- 0
}
}
}
}
for (dataSet in jobList) {
out <- data.frame(
Id = paste0(rep(1:12e4, each = 62), '_', 1:62),
Predicted = 0,
stringsAsFactors = FALSE
)
rownames(out) <- out$Id
for (id in 1:60) {
ret <- paste0("Ret_pred_", 120+id)
out[paste0(1:12e4, '_', id), "Predicted"] <- predictions[[dataSet]][,ret]
}
out[paste0(1:12e4, '_61'),"Predicted"] <- predictions[[dataSet]][,"Ret_PlusOne"]
out[paste0(1:12e4, '_62'),"Predicted"] <- predictions[[dataSet]][,"Ret_PlusTwo"]
write.csv(out, file.path(rDir, "final_predictions", paste0("FIN_", dataSet, "_onlyIntra.csv")), quote = FALSE, row.names = FALSE)
}