This is section for libraries, functions and overall styling settings.
# Rmarkdown global setting
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(cache = TRUE)
knitr::opts_chunk$set(fig.align = 'center')
#----------------------------------------------------------------------------------------------------------
# Import libraries
library(tidyverse)
library(tidycensus)
library(sf)
library(spdep)
library(caret)
library(ckanr)
library(FNN)
library(grid)
library(gridExtra)
library(ggcorrplot)# plot correlation plot
library(corrplot)
library(corrr) # another way to plot correlation plot
library(kableExtra)
library(jtools) # for regression model plots
library(ggstance) # to support jtools plots
library(ggpubr) # plotting R^2 value on ggplot point scatter
library(broom.mixed) # needed for effects plots
library(knitr)
library(rmarkdown)
library(RSocrata)
library(viridis)
library(ggplot2)
library(stargazer)
library(XML)
library(data.table)
library(ggpmisc)
library(patchwork)
library(spatstat)
library(raster)
library(classInt) # for KDE and ML risk class intervals
library(tableHTML)
library(pscl)
library(plotROC)
library(pROC)
library(lubridate)
#----------------------------------------------------------------------------------------------------------
# Temp
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")
root.dir = "https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/DATA/"
# Etc
options(scipen=999)
options(tigris_class = "sf")
#----------------------------------------------------------------------------------------------------------
# functions
st_c <- st_coordinates
st_coid <- st_centroid
mapThememin <- function(base_size = 10, title_size = 12, small_size = 8) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = title_size, colour = "black", hjust = 0.5),
plot.subtitle=element_text(size = base_size, colour = "black", hjust = 0.5, face="italic"),
plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5),
axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.background = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
strip.text.x = element_text(size = small_size, face="italic"),
strip.text.y = element_text(size = small_size, face="italic"),
strip.background = element_rect(colour="transparent", fill="transparent"),
legend.title = element_text(size = small_size),
legend.text = element_text(size = small_size),
legend.key.size = unit(0.4, "cm"))
}
mapThememin2 <- function(base_size = 8, title_size = 10, small_size = 6) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = title_size, colour = "black", hjust = 0.5),
plot.subtitle=element_text(size = base_size, colour = "black", hjust = 0.5, face="italic"),
plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5),
axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.background = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
strip.text.x = element_text(size = base_size),
strip.text.y = element_text(size = base_size),
strip.background = element_rect(colour="transparent", fill="transparent"),
legend.title = element_text(size = small_size),
legend.text = element_text(size = small_size),
legend.key.size = unit(0.25, "cm"))
}
corTheme <- function(base_size = 10, title_size = 12, small_size = 8){
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
text = element_text(size = 10),
panel.background = element_rect(fill = greyPalette5[1]),
axis.title.x = element_text(size = small_size),
axis.title.y = element_text(size = small_size),
plot.subtitle = element_text(hjust = 0.5, size = base_size),
plot.title = element_text(hjust = 0.5, size = title_size),
plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5),
strip.background = element_rect(colour="transparent", fill="transparent"))
}
corTheme2 <- function(base_size = 10, title_size = 12, small_size = 8){
theme(axis.text = element_text(size = small_size),
text = element_text(size = 10),
panel.background = element_rect(fill = greyPalette5[1]),
axis.title.x = element_text(size = small_size),
axis.title.y = element_text(size = small_size),
plot.subtitle = element_text(hjust = 0.5, size = base_size, face="italic"),
plot.title = element_text(hjust = 0.5, size = title_size),
plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5),
strip.background = element_rect(colour="transparent", fill="transparent"),
strip.text.x = element_text(size = small_size, face="italic"),
strip.text.y = element_text(size = base_size, face="italic"),
legend.text = element_text(size = small_size),
legend.title = element_text(size = small_size),
legend.key.size = unit(0.4, "cm"))
}
corTheme3 <- function(base_size = 9, title_size = 11, small_size = 7){
theme(axis.text = element_text(size = small_size),
text = element_text(size = 10),
panel.background = element_rect(fill = greyPalette5[1]),
axis.title.x = element_text(size = small_size),
axis.title.y = element_text(size = small_size),
plot.subtitle = element_text(hjust = 0.5, size = base_size, face="italic"),
plot.title = element_text(hjust = 0.5, size = title_size),
plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5),
legend.text = element_text(size = small_size))
}
corTheme4 <- function(base_size = 9, title_size = 11, small_size = 7){
theme(axis.text = element_text(size = small_size),
text = element_text(size = 10),
panel.background = element_rect(fill = greyPalette5[1]),
axis.title.x = element_text(size = small_size),
axis.title.y.right = element_text(size = small_size),
plot.subtitle = element_text(hjust = 0.5, size = base_size, face="italic"),
plot.title = element_text(hjust = 0.5, size = title_size),
plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5))
}
q5 <- function(variable) {as.factor(ntile(variable, 5))}
q <- function(variable) {as.factor(ntile(variable, 5))}
qBr <- function(df, variable, rnd) {
if (missing(rnd)) {
as.character(quantile(round(df[[variable]],0),
c(.01,.2,.4,.6,.8), na.rm=T))
} else if (rnd == FALSE | rnd == F) {
as.character(formatC(quantile(df[[variable]],
c(.01,.2,.4,.6,.8), na.rm=T),
digits = 3))
}
}
qBr2 <- function(df, variable, rnd) {
if (missing(rnd)) {
as.character(quantile(round(df[[variable]]*100,0)/100,
c(.01,.2,.4,.6,.8), na.rm=T))
} else if (rnd == FALSE | rnd == F) {
as.character(formatC(quantile(df[[variable]],
c(.01,.2,.4,.6,.8), na.rm=T),
digits = 3))
}
}
qBr3 <- function(df, variable, rnd) {
if (missing(rnd)) {
as.character(round(quantile(round(df[[variable]]*1000000,0),
c(.01,.2,.4,.6,.8), na.rm=T)),0)
} else if (rnd == FALSE | rnd == F) {
as.character(formatC(quantile(df[[variable]],
c(.01,.2,.4,.6,.8), na.rm=T),
digits = 3))
}
}
substrRight <- function(x, n){
substr(x, nchar(x)-n+1, nchar(x))
}
nn_function <- function(measureFrom,measureTo,k) {
measureFrom_Matrix <- as.matrix(measureFrom)
measureTo_Matrix <- as.matrix(measureTo)
nn <- get.knnx(measureTo, measureFrom, k)$nn.dist
output <- as.data.frame(nn) %>%
rownames_to_column(var = "thisPoint") %>%
gather(points, point_distance, V1:ncol(.)) %>%
arrange(as.numeric(thisPoint)) %>%
group_by(thisPoint) %>%
summarize(pointDistance = mean(point_distance)) %>%
arrange(as.numeric(thisPoint)) %>%
dplyr::select(-thisPoint) %>%
pull()
return(output)
}
myCrossValidate <- function(dataset, id, dependentVariable, indVariables) {
allPredictions <- data.frame()
cvID_list <- unique(dataset[[id]])
for (i in cvID_list) {
thisFold <- i
cat("This hold out fold is", thisFold, "\n")
fold.train <- filter(dataset, dataset[[id]] != thisFold) %>% as.data.frame() %>%
dplyr::select(id, geometry, indVariables, dependentVariable)
fold.test <- filter(dataset, dataset[[id]] == thisFold) %>% as.data.frame() %>%
dplyr::select(id, geometry, indVariables, dependentVariable)
regression <-
glm(countMVTheft ~ ., family = "poisson",
data = fold.train %>%
dplyr::select(-geometry, -id))
thisPrediction <-
mutate(fold.test, Prediction = predict(regression, fold.test, type = "response"))
allPredictions <-
rbind(allPredictions, thisPrediction)
}
return(st_sf(allPredictions))
}
#----------------------------------------------------------------------------------------------------------
# Colors ("https://coolors.co/gradient-palette/a8f368-f9035e?number=7")
bluePalette5 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#08519c")
blue2Palette5 <- c("#08519c","#3182bd","#6baed6","#bdd7e7","#eff3ff")
H_bluePalette5 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#252525")
orangePalette5 <- c("#FFF2E8","#FFD6B6","#FEB984","#FE9D51","#FD801F")
orange2Palette5 <- c("#FFDFD0","#FFB89F","#FF926E","#FF6B3D","#FF440C")
greyPalette5 <- c("#f7f7f7","#cccccc","#969696","#636363","#252525")
greenPalette5 <- c("#edf8e9","#bae4b3","#74c476","#31a354","#006d2c")
purplePalette5 <- c("#f2f0f7","#cbc9e2","#9e9ac8","#756bb1","#54278f")
#----------------------------------------------------------------------------------------------------------
# LoadAPI(Min's key)
census_api_key("4bbe4bead4e5817f6a6b79e62c5bea69e77f1887", overwrite = TRUE)In this analysis, 4,119 samples of those eligible for housing subsidies were used. Of these, 89.05% did not receive housing subsidies, and 10.95% received subsidies.
# Tracts geometry, Census Information
housingSubsidy <- read.csv("https://github.com/urbanSpatial/Public-Policy-Analytics-Landing/raw/master/DATA/Chapter6/housingSubsidy.csv")
ggplot(housingSubsidy, aes(fill = y)) +
geom_bar(mapping = aes(x = y), width = 0.2)+
scale_fill_manual(values = c(bluePalette5[2],bluePalette5[4]),
name = "Housing\nSubsidy")+
labs(title = "Data distribution of Subsidy",
subtitle = "No:3668(89.05%) Yes:451(10.95%)",
caption = "Figure 1-1")+
geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "longdash", size = 0.25, color = greyPalette5[2]))The purpose of this analysis is to strategically allocate related resources by extracting groups that are willing to receive and have a relatively high probability of receiving housing subsidies, rather than randomly distributing resources related to housing subsidies. In addition, we will conduct a Cost/Benefit analysis at the same time in this process to draw the best conclusion.
Nine of the attributes associated with who eligible for Housing subsidy are identified as consecutive attributes. Figure 2-1-1 visualizes the average of each characteristic according to whether or not housing subsidies were received in the past. This plot alone seems difficult to understand how these properties actually relate. Therefore, we wrote Figure 2-1-2 for a more useful analysis. In ‘Age’, there is a clear difference in receiving grants from the age of about 60. ‘Campaign’, ‘cons.conf.idx’, and ‘cons.price.idx’ seem to have a hard time determining a clear correlation. The ‘inflation rate’ shows a difference in receiving subsidies starting at about 3%. “p-day” means days after last contacted from a previous program, and 999 is a dummy value that has never been contacted, and I think it can be converted into meaningful information through feature engineering. ‘previous’ seems to show a significant difference starting from zero. ‘spend_on_repairs’ shows a significant difference starting at approximately $5,160. ‘unempoly_rate’ shows a significant difference starting at approximately 0.5.
housingSubsidy_cf <- housingSubsidy %>%
dplyr::select(-X, -y_numeric) %>%
select_if(is.numeric) %>%
cbind(y = housingSubsidy$y) %>%
gather(Variable, value, -y)
ggplot(housingSubsidy_cf, aes(y, abs(value), fill=y)) +
geom_bar(position = "dodge", stat = "summary", fun = "mean", width = 0.3) +
facet_wrap(~Variable, scales = "free") +
scale_fill_manual(values = c(bluePalette5[2],bluePalette5[4]),
name = "Housing\nSubsidy") +
labs(x="Housing Subsidy", y="Mean value",
title = "Mean value of Feature associations with Housing Subsidy",
subtitle = "continous features",
caption = "Figure 2-1-1") +
geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "longdash", size = 0.25, color = greyPalette5[2]))ggplot(housingSubsidy_cf) +
geom_density(aes(value, linetype=y, color=y, ), fill = "transparent", size = 0.8) +
facet_wrap(~Variable, scales = "free") +
scale_color_manual(values = c(bluePalette5[3],bluePalette5[4])) +
labs(title = "Feature Distribution of taking Housing Subsidy",
subtitle = "continous features",
caption = "Figure 2-1-2")+
geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "longdash", size = 0.25, color = greyPalette5[2]))+
guides(linetype=guide_legend(title = "Housing\nSubsidy"),
color=guide_legend(title = "Housing\nSubsidy")) housingSubsidy_mcf <- housingSubsidy %>%
dplyr::select(-X, -y_numeric) %>%
select_if(is.character) %>%
gather(Variable, value, -y) %>%
count(Variable, value, y)
ggplot(housingSubsidy_mcf, aes(value, n, fill = y)) +
geom_bar(position = "dodge", stat="identity", width = 0.9) +
facet_wrap(~Variable, scales="free") +
scale_fill_manual(values = c(bluePalette5[2],bluePalette5[4]),
name = "Housing\nSubsidy") +
labs(x="Feature", y="Count",
title = "Feature associations with the likelihood of taking Housing Subsidy",
subtitle = "Two/Three category Features",
caption = "Figure 2-2-1") +
geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "longdash", size = 0.25, color = greyPalette5[2]),
axis.text.x = element_text(angle = 45, hjust = 1, size = 5))In this section, training and test sets are created. createDataPartition is used to split the data. A 65% sample is used here to reduce processing time.
# divide sets
set.seed(3456)
trainIndex <- createDataPartition(housingSubsidy$X, p = .65,
list = FALSE,
times = 1)
housingSubsityTrain <- housingSubsidy[ trainIndex,]
housingSubsityTest <- housingSubsidy[-trainIndex,]# regression model 1
housingSubsidyModel_1 <- glm(y_numeric ~ .,
data=housingSubsityTrain %>%
dplyr::select(-X, -y, -day_of_week),
family="binomial" (link="logit"))
#summary(housingSubsidyModel_1)
#pR2(housingSubsidyModel_1)In this chapter, we feature engineered categorical data consisting of three or more categories and continuous data with distinct critical points in the distribution into two and three or more categories.
#1 education : Bchmore / Bchless
#Bchmore : "university.degree""professional.course"
#Bchless : "basic.9y""high.school""basic.6y""basic.4y""illiterate"
#unknown : "unknown"
housingSubsidy <- housingSubsidy %>%
mutate(education_min = case_when(education == "university.degree"| education == "professional.course" ~ "Bchmore",
education == "basic.9y"| education == "high.school"| education == "basic.6y"| education == "basic.4y"
|education == "illiterate" ~ "Bchless"
,education == "unknown" ~ "unknown"))
#2 JOB: employed / unemployed
#employed : "blue-collar""services""admin.""entrepreneur""self-employed" "technician""management""housemaid"
#unemployed : "unemployed""student""retired"
#unknown : "unknown"
housingSubsidy <- housingSubsidy %>%
mutate(job_min = case_when(job == "blue-collar"|job =="services"|job =="admin."|job =="entrepreneur"|job =="self-employed"
|job =="technician"|job =="management"|job =="housemaid" ~ "employed",
job == "unemployed"|job =="student"|job =="retired"~ "unemployed",
job == "unknown"~ "unknown"))
#3 marital: single / non-single
#single : "single""divorced"
#non-single : "married"
#unknown : "unknown"
housingSubsidy <- housingSubsidy %>%
mutate(martial_min = case_when(marital == "single"|marital =="divorced" ~ "single",
marital == "married" ~ "married",
marital == "unknown" ~ "unknown"))
#4 month: spring_summer / autumn_winter
# spring_summer: "apr" "may" "jun" "jul" "aug"
# autumn_winter: "nov" "sep" "mar" "oct" "dec"
housingSubsidy <- housingSubsidy %>%
mutate(month_min = case_when(month == "apr" |month == "may" |month == "jun" |month =="jul"|month == "aug" ~ "spring_summer",
month == "nov" |month =="sep"|month == "mar"|month == "oct"|month == "dec" ~ "autumn_winter"))
#5 age: less55 / over55
# less55: =< 55
# over55: > 55
housingSubsidy <- housingSubsidy %>%
mutate(age_min = case_when(age <= 55 ~ "less55",
age > 55 ~ "over55"))
#6 inflation rate: less3 / over3
# less3: =< 3
# over3: > 3
housingSubsidy <- housingSubsidy %>%
mutate(inflation_rate_min = case_when(inflation_rate <= 3 ~ "less3",
inflation_rate > 3 ~ "over3"))
#7 pdays: contact / non-contact
# contact: < 999
# none-contact : >= 999
housingSubsidy <- housingSubsidy %>%
mutate(pdays_min = case_when(pdays < 999 ~ "contact",
pdays >= 999 ~ "non-contact"))
#8 previous: less0 / over0
# less0: =< 0
# over0: > 0
housingSubsidy <- housingSubsidy %>%
mutate(previous_min = case_when(previous <= 0 ~ "less0",
previous > 0 ~ "over0"))
#9 spent_on_repairs: less5160 / over5160
# less5160: =< 5160
# over5160: > 5160
housingSubsidy <- housingSubsidy %>%
mutate(spent_on_repairs_min = case_when(spent_on_repairs <= 5160 ~ "less5160",
spent_on_repairs > 5160 ~ "over5160"))
#10 unemploy_rate: less0.5 / over0.5
# less0.5: =< 0.5
# over0.5: > 0.5
housingSubsidy <- housingSubsidy %>%
mutate(unemploy_rate_min = case_when(unemploy_rate <= 0.5 ~ "less0.5",
unemploy_rate > 0.5 ~ "over0.5"))
housingSubsidy_fe <- housingSubsidy %>%
dplyr::select(y, education_min, job_min, martial_min, month_min, age_min, inflation_rate_min, pdays_min, previous_min, spent_on_repairs_min, unemploy_rate_min) %>%
gather(Variable, value, -y) %>%
count(Variable, value, y)
ggplot(housingSubsidy_fe, aes(value, n, fill = y)) +
geom_bar(position = "dodge", stat="identity", width = 0.9) +
facet_wrap(~Variable, scales="free") +
scale_fill_manual(values = c(bluePalette5[2],bluePalette5[4]),
name = "Housing\nSubsidy") +
labs(x="Feature", y="Counts",
title = "Feature associations with the likelihood of taking Housing Subsidy",
subtitle = "Feature engineered category features",
caption = "Figure 4-1-1") +
geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "longdash", size = 0.25, color = greyPalette5[2]),
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 5))
Basically, we created logistic regression model 1 using the data we had, which is called ‘the kitchen sink model, and logistic regression model 2 using feature engineered data. To measure the Goodness of fit of both model, the first and weakest option is the ‘Psuedo R Squared’, which, unlike regular R-Squared, does not vary linearly from 0 to 1. It does not describe the proportion of the variance in the dependent variable explained by the model. However, it is useful for quickly comparing different model specifications, which may be helpful for feature selection. Below, the ‘McFadden R-Squared’ is demonstrated - the higher, the better. According to the Table 4-2, of model 1 is higher than that of model 2, it implies that model 1 is better than model 2.
rm12 <- cbind(rm1, rm2)
kable(rm12, caption = "<center><span style='font-size:14px; color:black; font-family: Arial'>Table 4-2-1. Regression Summary</span></center>",
col.names = c("Regression Model 1 : kitchen sink", "Regression Model 2 : engineered"), align = "c") %>%
kable_minimal(full_width = T, html_font = "Arial", font_size = 14) %>%
row_spec(4, color = greyPalette5[5], background = greyPalette5[1], bold = T) %>%
row_spec(0:6, extra_css = "line-height: 35px")| Regression Model 1 : kitchen sink | Regression Model 2 : engineered | |
|---|---|---|
| llh | -722.6105414 | -748.2550259 |
| llhNull | -931.0501911 | -931.0501911 |
| G2 | 416.8792995 | 365.5903304 |
| McFadden | 0.2238758 | 0.1963322 |
| r2ML | 0.1441071 | 0.1275633 |
| r2CU | 0.2876608 | 0.2546368 |
# model_1
testProbs_1 <- data.frame(Outcome = as.factor(housingSubsityTest$y_numeric),
Probs = predict(housingSubsidyModel_1, housingSubsityTest, type= "response"))
testProbs_1 <-
testProbs_1 %>%
mutate(predOutcome = as.factor(ifelse(testProbs_1$Probs > 0.5 , 1, 0)),
Model = 1)
# model_2
testProbs_2 <- data.frame(Outcome = as.factor(housingSubsityTest$y_numeric),
Probs = predict(housingSubsidyModel_2, housingSubsityTest, type= "response"))
testProbs_2 <-
testProbs_2 %>%
mutate(predOutcome = as.factor(ifelse(testProbs_2$Probs > 0.5 , 1, 0)),
Model = 2)
testProbs_12 <- rbind(testProbs_1,testProbs_2)
gg431 <- ggplot(testProbs_1, aes(x = Probs, fill = as.factor(Outcome))) +
geom_density() +
facet_grid(Outcome ~ .) +
scale_fill_manual(values = c(bluePalette5[2], bluePalette5[4])) + xlim(0, 1) +
geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
geom_vline(xintercept = 0.5, size = 0.4, color = bluePalette5[3], linetype = "longdash") +
labs(x = "Probability of Getting Subsidy", y = "Density",
title = "Regression Model 1",
subtitle = "kitchen sink",
caption = "Figure 4-3-1") +
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
legend.position = "none",
strip.text.x = element_blank(),
strip.text.y = element_blank())
Outcome.labs <- c("Don't Get Subsidy", "Get Subsidy")
names(Outcome.labs) <-c(0, 1)
gg432 <- ggplot(testProbs_2, aes(x = Probs, fill = as.factor(Outcome))) +
geom_density() +
facet_grid(Outcome ~ .,labeller = as_labeller(Outcome.labs)) +
scale_fill_manual(values = c(bluePalette5[2], bluePalette5[4])) + xlim(0, 1) +
geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
geom_vline(xintercept = 0.5, size = 0.4, color = bluePalette5[3], linetype = "longdash") +
scale_y_continuous(position = "right")+
labs(x = "Probability of Getting Subsidy", y = "",
title = "Regression Model 2",
subtitle = "engineered",
caption = "Figure 4-3-2") +
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
legend.position = "none")
gg431 + gg432 Looking at the blue vertical dotted line drawn in Figures 4-3-1, 4-3-2, it is drawn with a 50% probability. Based on this threshold, if we say we’re going to get subsidies if we go above 50%, and if we don’t go above 50%, is our model a good model? Table 4-3 below provides indicators to confirm this. Here, it is generally considered a good model if the sensitivity and specificity are high. However, the low sensitivity seems to need to be improved because the purpose of this survey is to accurately predict the people who will receive the housing subsidy. To confirm this, in the next chapter, it will be necessary to create a confusion matrix for all thresholds and estimate the optimal thresholds.
# model_1
cf1 <- caret::confusionMatrix(testProbs_1$predOutcome, testProbs_1$Outcome, positive = "1")
cf2 <- caret::confusionMatrix(testProbs_2$predOutcome, testProbs_2$Outcome, positive = "1")
cf <- round(rbind(cf1$byClass, cf2$byClass),3)
rownames(cf) <- c("Kitchen sink model","Engineered model")
colnames(cf) <- c("Sens","Spec","Pos Pred","Neg Pred","Precis","Recall","F1","Preval","Det Rate","Det Preval", "Bal Accu")
kable(cf, caption = "<center><span style='font-size:14px; color:black; font-family: Arial'>Table 4-3. confusion metrics for the threshold of 50% of both model</span></center>", align = "c", booktabs = T, escape=FALSE) %>%
kable_minimal(full_width = T, html_font = "Arial", font_size = 13)%>%
column_spec(1, bold = T) %>%
row_spec(1:2, extra_css = "line-height: 30px")| Sens | Spec | Pos Pred | Neg Pred | Precis | Recall | F1 | Preval | Det Rate | Det Preval | Bal Accu | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Kitchen sink model | 0.239 | 0.987 | 0.685 | 0.915 | 0.685 | 0.239 | 0.354 | 0.108 | 0.026 | 0.038 | 0.613 |
| Engineered model | 0.239 | 0.988 | 0.712 | 0.915 | 0.712 | 0.239 | 0.357 | 0.108 | 0.026 | 0.036 | 0.614 |
In Figures 4-3-3, 4-3-4, the ROC curve was prepared. This curve shows the relationship between True Positive and False Positive for all thresholds, and the gray diagonal makes predictions in the absence of a model, in other words, because it means predicting with the probability of coin flip, if the ROC curve is formed below the gray diagonal, it becomes an inferior model. This graph provides a very good intuition for the fit of the model, especially by comparing AUC:area under curve, we can compare the ratios of True positive. Looking at the AUC of the two models, the Engineered model is 0.781 and the Kitchen sink model is 0.755, indicating that the Engineered model is a better model.
auc1 <- round(pROC::auc(testProbs_1$Outcome, testProbs_1$Probs),3)
auc2 <- round(pROC::auc(testProbs_2$Outcome, testProbs_2$Probs),3)
auc1 <- paste("kitchen sink\n\n", "AUC:", auc1)
auc2 <- paste("engineered\n\n","AUC:", auc2)
ggauc1 <- ggplot(testProbs_1, aes(d = as.numeric(testProbs_1$Outcome), m = Probs)) +
geom_roc(n.cuts = 30, labels = FALSE, colour = bluePalette5[3], pointsize = 0.4, size = 0.6) +
style_roc(theme = theme_grey) +
geom_abline(slope = 1, intercept = 0, size = 0.4, color = greyPalette5[3]) +
labs(title = "ROC curve",
subtitle = auc1,
caption = "Figure 4-3-3",
y = "True Positive Fraction") +
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
legend.position = "none")
ggauc2 <- ggplot(testProbs_2, aes(d = as.numeric(testProbs_2$Outcome), m = Probs)) +
geom_roc(n.cuts = 30, labels = FALSE, colour = bluePalette5[4], pointsize = 0.4, size = 0.6) +
style_roc(theme = theme_grey) +
geom_abline(slope = 1, intercept = 0, size = 0.4, color = greyPalette5[3]) +
scale_y_continuous(position = "right")+
labs(title = "ROC curve",
subtitle = auc2,
caption = "Figure 4-3-4",
y = "True Positive Fraction")+
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
legend.position = "none")
ggauc1 + ggauc2In this section, each model is divided into 100 folds and each ROC, Sensitivity, and Specificity is calculated. Similarly, the Engineered model appears to be much better than kitchen sink model with higher value of Sensitivity. Looking at Figure 4-4-2, generalizability is also shown to be superior because sensitivity is also focused on the mean value of the engineered model. However, Specificity shows a worrying level of Outlier in both models.
ctrl2 <- trainControl(method = "cv", number = 100, classProbs=TRUE, summaryFunction=twoClassSummary)
cv_1 <- train(y ~ .,
data=housingSubsidy %>%
dplyr::select(-X, -inflation_rate_min, -pdays_min, -previous_min, -spent_on_repairs_min, -unemploy_rate_min,
-education_min, -job_min, -martial_min, -month_min, -age_min, -y_numeric),
method="glm", family="binomial", metric="ROC", trControl = ctrl2)
cv_gfm1 <- round(cv_1$results[2:4],4)
cv_2 <- train(y ~ .,
data=housingSubsidy %>%
dplyr::select(-X, -education, -job, -marital, -month, -age, -day_of_week, -inflation_rate_min, -pdays_min, -previous_min, -spent_on_repairs_min, -unemploy_rate_min, -y_numeric),
method="glm", family="binomial",
metric="ROC", trControl = ctrl2)
cv_gfm2 <- round(cv_2$results[2:4],4)
cv_gfm <- rbind(cv_gfm1, cv_gfm2)
rownames(cv_gfm) <- c("Kitchen sink model","Engineered model")
colnames(cv_gfm) <- c("ROC(AUC)","Sensitivity","Specificity")
cv_gfm <- cv_gfm %>% mutate(Sensitivity = cell_spec(Sensitivity, background = ifelse(Sensitivity > 0.982, bluePalette5[2], "white"),
bold = ifelse(Sensitivity > 0.982, T, F)))
kable(cv_gfm, caption = "<center><span style='font-size:14px; color:black; font-family: Arial'>Table 4-4-1. The Sensitivity(True Positive Rate) of each model</span></center>", align = "c", booktabs = T, escape=FALSE) %>%
kable_minimal(full_width = F, html_font = "Arial", font_size = 16)%>%
column_spec(2:4, width = "5.3cm") %>%
column_spec(1, bold = T) %>%
row_spec(1:2, extra_css = "line-height: 30px")| ROC(AUC) | Sensitivity | Specificity | |
|---|---|---|---|
| Kitchen sink model | 0.7645 | 0.9802 | 0.2155 |
| Engineered model | 0.7725 | 0.9858 | 0.2160 |
cv_1_resample <- cv_1$resample %>%
dplyr::select(-Resample) %>%
gather(metric, value) %>% # column to row with value (100obj > 300obj)
left_join(gather(cv_1$results[2:4], metric, mean)) %>% #metric is key to left join
mutate(Model = "Kitchen Sink Model")
cv_2_resample <- cv_2$resample %>%
dplyr::select(-Resample) %>%
gather(metric, value) %>% # column to row with value (100obj > 300obj)
left_join(gather(cv_2$results[2:4], metric, mean)) %>% #metric is key to left join
mutate(Model = "Engineered_Model")
cv_12_resample <- rbind(cv_1_resample,cv_2_resample)
ggplot(cv_12_resample, aes(value)) +
geom_histogram(bins=35, fill = bluePalette5[3], color = "white") +
facet_wrap(vars(Model, metric)) +
geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
geom_vline(aes(xintercept = mean), colour = bluePalette5[4], linetype = 4, size = 0.7) +
scale_x_continuous(limits = c(0, 1)) +
labs(x="Goodness of Fit", y="Count", title="CV Goodness of Fit Metrics",
subtitle = "Across-fold mean reprented as dotted lines",
caption = "Figure 4-4-2")+
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
legend.position = "none")
We have previously checked the suitability of the current prediction model from various perspectives. However, the purpose of this study is to derive the best scenario for maximum effectiveness by allocating limited resources. This requires estimating the costs and benefits of each case. We estimated the total of four categories of Revenue as follows.
True Positive : Predicted correctly homeowner would enter credit program allocated the marketing resources, and 25% ultimately achieved the credit
| Cost/benefit | $ |
|---|---|
| session costs(mailers, phone calls, and information/counseling) | -$2,850 |
| subsidy costs | -$5,000 * 0.25 |
| premium after taking credit | +$10,000 * 0.25 |
| Neighborhood premium | +$56,000 * 0.25 |
| Total Revenue | +$12,400 |
True Negative : Predicted correctly homeowner would not take the credit, no marketing resources were allocated, and no credit was allocated
False Positive : Predicted incorrectly homeowner would take the credit; allocated marketing resources; no credit allocated.
| Cost/benefit | $ |
|---|---|
| session costs(mailers, phone calls, and information/counseling) | -$2,850 |
| Total Revenue | -$2,850 |
False Negative : We predicted that a homeowner would not take the credit but they did. These are likely homeowners who signed up for reasons unrelated to the marketing campaign. Thus, we ‘0 out’ this category, assuming the cost/benefit of this is $0.
cost_benefit_table <-
testProbs_2 %>%
count(predOutcome, Outcome) %>%
summarize(True_Negative = sum(n[predOutcome==0 & Outcome==0]),
True_Positive = sum(n[predOutcome==1 & Outcome==1]),
False_Negative = sum(n[predOutcome==0 & Outcome==1]),
False_Positive = sum(n[predOutcome==1 & Outcome==0])) %>%
gather(Variable, Count) %>%
mutate(Revenue =
ifelse(Variable == "True_Negative", Count * 0,
ifelse(Variable == "True_Positive",(Count * (-2850-5000*0.25+10000*0.25+56000*0.25)),
ifelse(Variable == "False_Negative", Count * 0,
ifelse(Variable == "False_Positive", Count * (-2850), 0))))) %>%
bind_cols(data.frame(Description = c(
"We predicted correctly homeowner would not take the credit",
"We predicted correctly homeowner would enter credit program",
"We predicted that a homeowner would not take the credit but they did",
"We predicted incorrectly homeowner would take the credit")))
kable(cost_benefit_table, caption = "<center><span style='font-size:14px; color:black; font-family: Arial'>Table 5-1. Cost/Benefit Table of 50% thresholds</span></center>",
align = "c") %>%
kable_minimal(full_width = T, html_font = "Arial", font_size = 15) %>%
column_spec(1, bold = TRUE) %>%
row_spec(c(2,4), background = greyPalette5[1]) %>%
row_spec(1:4, extra_css = "line-height: 35px") | Variable | Count | Revenue | Description |
|---|---|---|---|
| True_Negative | 1270 | 0 | We predicted correctly homeowner would not take the credit |
| True_Positive | 37 | 458800 | We predicted correctly homeowner would enter credit program |
| False_Negative | 118 | 0 | We predicted that a homeowner would not take the credit but they did |
| False_Positive | 15 | -42750 | We predicted incorrectly homeowner would take the credit |
In this section, we’ll apply cost-benefit equations to our regression model to determine revenue at each threshold. Figure 5-2-1 shows how much the cost is incurred for each threshold. Up to the threshold of 0.17 you can see that the cost reduction effect of decreasing False Negative and the revenue reduction effect of True Positive decrease offset each other, but if the threshold increases further, the revenue decrease faster than that of False Negative.
iterateThresholds <- function(data) {
x = .01
all_prediction <- data.frame()
while (x <= 1) {
this_prediction <- testProbs_2 %>%
mutate(predOutcome = ifelse(Probs > x, 1, 0)) %>%
count(predOutcome, Outcome) %>%
summarize(True_Negative = sum(n[predOutcome==0 & Outcome==0]),
True_Positive = sum(n[predOutcome==1 & Outcome==1]),
False_Negative = sum(n[predOutcome==0 & Outcome==1]),
False_Positive = sum(n[predOutcome==1 & Outcome==0])) %>%
gather(Variable, Count) %>%
mutate(Revenue =
ifelse(Variable == "True_Negative", Count * 0,
ifelse(Variable == "True_Positive",(Count * (-2850-5000*0.25+10000*0.25+56000*0.25)),
ifelse(Variable == "False_Negative", Count * 0,
ifelse(Variable == "False_Positive", Count * (-2850), 0)))),
Threshold = x)
all_prediction <- rbind(all_prediction, this_prediction)
x <- x + .01
}
return(all_prediction)
}
whichThreshold <- iterateThresholds(testProbs2)
whichThreshold %>%
ggplot(.,aes(Threshold, Revenue, colour = Variable)) +
geom_point() +
scale_colour_manual(values = bluePalette5[2:5]) +
labs(title = "Revenue by confusion matrix type and threshold",
y = "Revenue",
caption = "Figure 5-2-1") +
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed", size = 0.35, color = greyPalette5[2]),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.35, color = greyPalette5[2]),
legend.position = "bottom",
legend.background = element_rect(fill = "white")) +
guides(colour=guide_legend(title = "Confusion Matrix")) In Figure 5-2-3, as the threshold increases, there are more and more people who receive the actual subsidy, but in Figure 5-2-2, we can see that when the threshold is 0.17, we get the most revenue. It seems that it can be an important indicator of judgment when it is aimed at maximizing the effectiveness of using limited resources.
whichThreshold_revenue_subsidy <-
whichThreshold %>%
mutate(Credits =
ifelse(Variable == "True_Positive", Count*0.25,
ifelse(Variable == "False_Negative", Count, 0))) %>%
group_by(Threshold) %>%
summarize(Revenue = sum(Revenue)/1000,
Total_Count_of_Credits = sum(Credits))
gg522 <- ggplot(whichThreshold_revenue_subsidy, aes(Threshold, Revenue)) +
geom_point(color = bluePalette5[3], size = 1.2) +
scale_colour_manual(values = bluePalette5) +
labs(title = "Revenue by threshold",
y = "Revenue(1,000$)",
caption = "Figure 5-2-2") +
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
legend.position = "bottom",
axis.text.y = element_text(angle = 90)) +
guides(colour=guide_legend(title = "Confusion Matrix"))
gg523 <- ggplot(whichThreshold_revenue_subsidy, aes(Threshold, Total_Count_of_Credits)) +
geom_point(color = bluePalette5[4], size = 1.2) +
scale_colour_manual(values = bluePalette5) +
scale_y_continuous(position = "right")+
labs(title = "Total Counts of Credits by threshold",
y = "Total Counts of Credits",
caption = "Figure 5-2-3") +
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
legend.position = "bottom",
axis.text.y = element_text(angle = 270)) +
guides(colour=guide_legend(title = "Confusion Matrix"))
gg522 + gg523
The threshold is a baseline for how to analyze each probability obtained as a result of a logistic regression. Intuitively, it seems that if the probability from the predictive model exceeds 50%, it can be interpreted as taking subsidies, but this is not an indicator of the suitability of the model. In our cost/profit model, the maximum profit can be generated only by maximizing the True positive and minimizing the False positive. Here we can think of Sensitivity again. Sensitivity, which also called True Positive Rate, refers to the proportion of actual subsidies among the results of predicting receiving subsidies. Figure 5-3-1 visualizes the Revenue corresponding to each threshold. It can be seen that the Revenue rises rapidly up to the critical point of 0.17, and then decreases continuously after that. In Table 5-3, the actual return of 50% and the optimal threshold of 17% were compared with the recipient of the subsidy. The overall profit differs by about $339,400, and the optimal threshold of 17% is 181% of the revenue when the threshold is 50%. This can be seen as a meaningful level.
whichThreshold_revenue_subsidy <-
whichThreshold %>%
mutate(Credits =
ifelse(Variable == "True_Positive", Count*0.25,
ifelse(Variable == "False_Negative", Count, 0))) %>%
group_by(Threshold) %>%
summarize(Revenue = sum(Revenue),
Total_Count_of_Credits = sum(Credits))
finalTable <- whichThreshold_revenue_subsidy %>%
filter(Threshold >= 0.50 & Threshold < 0.51 | Threshold >= pull(arrange(whichThreshold_revenue_subsidy, -Revenue)[1,1]) & Threshold < pull(arrange(whichThreshold_revenue_subsidy, -Revenue)[1,1])+0.01 )
kable(finalTable, caption = "<center><span style='font-size:14px; color:black; font-family: Arial'>Table 5-3. Cost/Benefit Comparison Table</span></center>",
align = "c") %>%
kable_minimal(full_width = T, html_font = "Arial", font_size = 16)%>%
column_spec(1:2, bold = TRUE) %>%
row_spec(1, color = orangePalette5[5]) %>%
row_spec(2, color = bluePalette5[4]) %>%
row_spec(1:2, extra_css = "line-height: 30px") | Threshold | Revenue | Total_Count_of_Credits |
|---|---|---|
| 0.17 | 755450 | 95.00 |
| 0.50 | 416050 | 127.25 |
ggplot(whichThreshold_revenue_subsidy)+
geom_line(aes(x = Threshold, y = Revenue), color = greyPalette5[3], size = 0.8)+
geom_vline(xintercept = pull(arrange(whichThreshold_revenue_subsidy, -Revenue)[1,1]), color = orangePalette5[5], size = 0.8, linetype = "longdash")+
geom_vline(xintercept = 0.5, color = bluePalette5[4], size = 0.8, linetype = "longdash")+
scale_x_continuous(minor_breaks = seq(0, 1, 0.05))+
labs(title = "Model Revenues By Threshold",
subtitle = "Orange line : 0.17 Optimal Threshold Blue line : 0.5 Threshold",
caption = "Figure 5-3-1")+
corTheme2()+
theme(panel.background = element_blank(),
panel.grid.major.y = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
panel.grid.major.x = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
panel.grid.minor.x = element_line(linetype = "dotted", size = 0.45, color = greyPalette5[2]),
legend.position = "bottom",
axis.text.y = element_text(angle = 90))The purpose of this analysis is not to allocate limited resources at random, but to allocate resources strategically for maximum policy effect. Of course, this is a fairly narrow-minded analysis because it analyzes policies only in terms of cost and benefit. If we allocated resources randomly, the sensitivity would be 50%. However, according to CV tests, we have almost 90% more sensitivity, which means we have almost exactly identified who will receive the subsidy. In addition, in each case, we estimated costs and benefits to derive thresholds that make up the most profitable groups in practice. This can be a very useful tool of judgment in terms of cost and profit.