Ⅰ. Setup

Ⅱ. Data

Ⅲ. Process

Ⅳ. Conclustion

Ⅰ. Setup

This is section for libraries, functions and overall styling settings.

# Rmarkdown global setting
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(cache = TRUE)
knitr::opts_chunk$set(fig.align = 'center')

#----------------------------------------------------------------------------------------------------------  

# Import libraries
library(tidyverse)
library(tidycensus)
library(sf)
library(spdep)
library(caret)
library(ckanr)
library(FNN)
library(grid)
library(gridExtra)
library(ggcorrplot)# plot correlation plot
library(corrplot)
library(corrr)      # another way to plot correlation plot
library(kableExtra)
library(jtools)     # for regression model plots
library(ggstance) # to support jtools plots
library(ggpubr)    # plotting R^2 value on ggplot point scatter
library(broom.mixed) # needed for effects plots
library(knitr)
library(rmarkdown)
library(RSocrata)
library(viridis)
library(ggplot2)
library(stargazer)
library(XML)
library(data.table)
library(ggpmisc)
library(patchwork)
library(spatstat)
library(raster)
library(classInt)   # for KDE and ML risk class intervals
library(tableHTML)
library(pscl)
library(plotROC)
library(pROC)
library(lubridate)

#----------------------------------------------------------------------------------------------------------  

# Temp
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")
root.dir = "https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/DATA/"

# Etc
options(scipen=999)
options(tigris_class = "sf")

#----------------------------------------------------------------------------------------------------------  

# functions

st_c    <- st_coordinates
st_coid <- st_centroid

mapThememin <- function(base_size = 10, title_size = 12, small_size = 8) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(size = title_size, colour = "black", hjust = 0.5),
    plot.subtitle=element_text(size = base_size, colour = "black", hjust = 0.5, face="italic"),
    plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.background = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    strip.text.x = element_text(size = small_size, face="italic"),
    strip.text.y = element_text(size = small_size, face="italic"),
    strip.background = element_rect(colour="transparent", fill="transparent"),
    legend.title = element_text(size = small_size),
    legend.text = element_text(size = small_size),
    legend.key.size = unit(0.4, "cm"))
}

mapThememin2 <- function(base_size = 8, title_size = 10, small_size = 6) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(size = title_size, colour = "black", hjust = 0.5),
    plot.subtitle=element_text(size = base_size, colour = "black", hjust = 0.5, face="italic"),
    plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.background = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    strip.text.x = element_text(size = base_size),
    strip.text.y = element_text(size = base_size),
    strip.background = element_rect(colour="transparent", fill="transparent"),
    legend.title = element_text(size = small_size),
    legend.text = element_text(size = small_size),
    legend.key.size = unit(0.25, "cm"))
}


corTheme <- function(base_size = 10, title_size = 12, small_size = 8){
  theme(axis.text =  element_blank(), 
        axis.ticks = element_blank(), 
        text = element_text(size = 10),
        panel.background = element_rect(fill = greyPalette5[1]),
        axis.title.x = element_text(size = small_size),
        axis.title.y = element_text(size = small_size),
        plot.subtitle = element_text(hjust = 0.5, size = base_size),
        plot.title = element_text(hjust = 0.5, size = title_size),
        plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5),
        strip.background = element_rect(colour="transparent", fill="transparent"))
}


corTheme2 <- function(base_size = 10, title_size = 12, small_size = 8){
  theme(axis.text =  element_text(size = small_size),
        text = element_text(size = 10),
        panel.background = element_rect(fill = greyPalette5[1]),
        axis.title.x = element_text(size = small_size),
        axis.title.y = element_text(size = small_size),
        plot.subtitle = element_text(hjust = 0.5, size = base_size,  face="italic"),
        plot.title = element_text(hjust = 0.5, size = title_size),
        plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5),
        strip.background = element_rect(colour="transparent", fill="transparent"),
        strip.text.x = element_text(size = small_size, face="italic"),
        strip.text.y = element_text(size = base_size, face="italic"),
        legend.text = element_text(size = small_size),
        legend.title = element_text(size = small_size),
        legend.key.size = unit(0.4, "cm"))
}

corTheme3 <- function(base_size = 9, title_size = 11, small_size = 7){
  theme(axis.text =  element_text(size = small_size),
        text = element_text(size = 10),
        panel.background = element_rect(fill = greyPalette5[1]),
        axis.title.x = element_text(size = small_size),
        axis.title.y = element_text(size = small_size),
        plot.subtitle = element_text(hjust = 0.5, size = base_size,  face="italic"),
        plot.title = element_text(hjust = 0.5, size = title_size),
        plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5),
        legend.text = element_text(size = small_size))
}

corTheme4 <- function(base_size = 9, title_size = 11, small_size = 7){
  theme(axis.text =  element_text(size = small_size),
        text = element_text(size = 10),
        panel.background = element_rect(fill = greyPalette5[1]),
        axis.title.x = element_text(size = small_size),
        axis.title.y.right = element_text(size = small_size),
        plot.subtitle = element_text(hjust = 0.5, size = base_size,  face="italic"),
        plot.title = element_text(hjust = 0.5, size = title_size),
        plot.caption=element_text(size = small_size, colour = "black", hjust = 0.5))
}


q5 <- function(variable) {as.factor(ntile(variable, 5))}

q <- function(variable) {as.factor(ntile(variable, 5))}

qBr <- function(df, variable, rnd) {
  if (missing(rnd)) {
    as.character(quantile(round(df[[variable]],0),
                          c(.01,.2,.4,.6,.8), na.rm=T))
  } else if (rnd == FALSE | rnd == F) {
    as.character(formatC(quantile(df[[variable]],
                                  c(.01,.2,.4,.6,.8), na.rm=T),
                         digits = 3))
  }
}

qBr2 <- function(df, variable, rnd) {
  if (missing(rnd)) {
    as.character(quantile(round(df[[variable]]*100,0)/100,
                          c(.01,.2,.4,.6,.8), na.rm=T))
  } else if (rnd == FALSE | rnd == F) {
    as.character(formatC(quantile(df[[variable]],
                                  c(.01,.2,.4,.6,.8), na.rm=T),
                         digits = 3))
  }
}

qBr3 <- function(df, variable, rnd) {
  if (missing(rnd)) {
    as.character(round(quantile(round(df[[variable]]*1000000,0),
                          c(.01,.2,.4,.6,.8), na.rm=T)),0)
  } else if (rnd == FALSE | rnd == F) {
    as.character(formatC(quantile(df[[variable]],
                                  c(.01,.2,.4,.6,.8), na.rm=T),
                         digits = 3))
  }
}

substrRight <- function(x, n){
  substr(x, nchar(x)-n+1, nchar(x))
}


nn_function <- function(measureFrom,measureTo,k) {
  measureFrom_Matrix <- as.matrix(measureFrom)
  measureTo_Matrix <- as.matrix(measureTo)
  nn <- get.knnx(measureTo, measureFrom, k)$nn.dist
  output <- as.data.frame(nn) %>%
    rownames_to_column(var = "thisPoint") %>%
    gather(points, point_distance, V1:ncol(.)) %>%
    arrange(as.numeric(thisPoint)) %>%
    group_by(thisPoint) %>%
    summarize(pointDistance = mean(point_distance)) %>%
    arrange(as.numeric(thisPoint)) %>% 
    dplyr::select(-thisPoint) %>%
    pull()
  return(output)  
}


myCrossValidate <- function(dataset, id, dependentVariable, indVariables) {
  
  allPredictions <- data.frame()
  cvID_list <- unique(dataset[[id]])
  
  for (i in cvID_list) {
    
    thisFold <- i
    cat("This hold out fold is", thisFold, "\n")
    
    fold.train <- filter(dataset, dataset[[id]] != thisFold) %>% as.data.frame() %>% 
      dplyr::select(id, geometry, indVariables, dependentVariable)
    fold.test  <- filter(dataset, dataset[[id]] == thisFold) %>% as.data.frame() %>% 
      dplyr::select(id, geometry, indVariables, dependentVariable)
    
    regression <-
      glm(countMVTheft ~ ., family = "poisson", 
          data = fold.train %>% 
            dplyr::select(-geometry, -id))
    
    thisPrediction <- 
      mutate(fold.test, Prediction = predict(regression, fold.test, type = "response"))
    
    allPredictions <-
      rbind(allPredictions, thisPrediction)
    
  }
  return(st_sf(allPredictions))
}

#----------------------------------------------------------------------------------------------------------  

# Colors ("https://coolors.co/gradient-palette/a8f368-f9035e?number=7")
bluePalette5 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#08519c")
blue2Palette5 <- c("#08519c","#3182bd","#6baed6","#bdd7e7","#eff3ff")
H_bluePalette5 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#252525")
orangePalette5 <- c("#FFF2E8","#FFD6B6","#FEB984","#FE9D51","#FD801F")
orange2Palette5 <- c("#FFDFD0","#FFB89F","#FF926E","#FF6B3D","#FF440C")
greyPalette5 <- c("#f7f7f7","#cccccc","#969696","#636363","#252525")
greenPalette5 <- c("#edf8e9","#bae4b3","#74c476","#31a354","#006d2c")
purplePalette5 <- c("#f2f0f7","#cbc9e2","#9e9ac8","#756bb1","#54278f")

#----------------------------------------------------------------------------------------------------------  

# LoadAPI(Min's key)
census_api_key("4bbe4bead4e5817f6a6b79e62c5bea69e77f1887", overwrite = TRUE)

Ⅱ. Data

In this analysis, 4,119 samples of those eligible for housing subsidies were used. Of these, 89.05% did not receive housing subsidies, and 10.95% received subsidies.

# Tracts geometry, Census Information

housingSubsidy <- read.csv("https://github.com/urbanSpatial/Public-Policy-Analytics-Landing/raw/master/DATA/Chapter6/housingSubsidy.csv")

ggplot(housingSubsidy, aes(fill = y)) +
  geom_bar(mapping = aes(x = y), width = 0.2)+
  scale_fill_manual(values = c(bluePalette5[2],bluePalette5[4]),
                    name = "Housing\nSubsidy")+
  labs(title = "Data distribution of Subsidy",
       subtitle = "No:3668(89.05%)   Yes:451(10.95%)",
       caption = "Figure 1-1")+
  geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "longdash", size = 0.25, color = greyPalette5[2]))

Ⅲ. Process

1. Motivation for analysis

The purpose of this analysis is to strategically allocate related resources by extracting groups that are willing to receive and have a relatively high probability of receiving housing subsidies, rather than randomly distributing resources related to housing subsidies. In addition, we will conduct a Cost/Benefit analysis at the same time in this process to draw the best conclusion.

2. Exploratory analysis

2.1 Continuous features

Nine of the attributes associated with who eligible for Housing subsidy are identified as consecutive attributes. Figure 2-1-1 visualizes the average of each characteristic according to whether or not housing subsidies were received in the past. This plot alone seems difficult to understand how these properties actually relate. Therefore, we wrote Figure 2-1-2 for a more useful analysis. In ‘Age’, there is a clear difference in receiving grants from the age of about 60. ‘Campaign’, ‘cons.conf.idx’, and ‘cons.price.idx’ seem to have a hard time determining a clear correlation. The ‘inflation rate’ shows a difference in receiving subsidies starting at about 3%. “p-day” means days after last contacted from a previous program, and 999 is a dummy value that has never been contacted, and I think it can be converted into meaningful information through feature engineering. ‘previous’ seems to show a significant difference starting from zero. ‘spend_on_repairs’ shows a significant difference starting at approximately $5,160. ‘unempoly_rate’ shows a significant difference starting at approximately 0.5.

housingSubsidy_cf <- housingSubsidy %>%
  dplyr::select(-X, -y_numeric) %>%
  select_if(is.numeric) %>% 
  cbind(y = housingSubsidy$y) %>%
  gather(Variable, value, -y)

ggplot(housingSubsidy_cf, aes(y, abs(value), fill=y)) + 
  geom_bar(position = "dodge", stat = "summary", fun = "mean", width = 0.3) + 
  facet_wrap(~Variable, scales = "free") +
  scale_fill_manual(values = c(bluePalette5[2],bluePalette5[4]),
                    name = "Housing\nSubsidy") +
  labs(x="Housing Subsidy", y="Mean value",
       title = "Mean value of Feature associations with Housing Subsidy",
       subtitle = "continous features",
       caption = "Figure 2-1-1") +
  geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "longdash", size = 0.25, color = greyPalette5[2]))

ggplot(housingSubsidy_cf) + 
  geom_density(aes(value, linetype=y, color=y, ), fill = "transparent", size = 0.8) + 
  facet_wrap(~Variable, scales = "free") +
  scale_color_manual(values = c(bluePalette5[3],bluePalette5[4])) +
  labs(title = "Feature Distribution of taking Housing Subsidy",
       subtitle = "continous features",
       caption = "Figure 2-1-2")+
  geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "longdash", size = 0.25, color = greyPalette5[2]))+
  guides(linetype=guide_legend(title = "Housing\nSubsidy"),
         color=guide_legend(title = "Housing\nSubsidy")) 

2.2 Categorical Features

housingSubsidy_mcf <- housingSubsidy %>%
  dplyr::select(-X, -y_numeric) %>%
  select_if(is.character) %>% 
  gather(Variable, value, -y) %>%
  count(Variable, value, y)

ggplot(housingSubsidy_mcf, aes(value, n, fill = y)) +
  geom_bar(position = "dodge", stat="identity", width = 0.9) +
  facet_wrap(~Variable, scales="free") +
  scale_fill_manual(values = c(bluePalette5[2],bluePalette5[4]),
                    name = "Housing\nSubsidy") +
  labs(x="Feature", y="Count",
       title = "Feature associations with the likelihood of taking Housing Subsidy",
       subtitle = "Two/Three category Features",
       caption = "Figure 2-2-1") +
  geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "longdash", size = 0.25, color = greyPalette5[2]),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

3. Training/Testing sets

In this section, training and test sets are created. createDataPartition is used to split the data. A 65% sample is used here to reduce processing time.

# divide sets

set.seed(3456)
trainIndex <- createDataPartition(housingSubsidy$X, p = .65,
                                  list = FALSE,
                                  times = 1)

housingSubsityTrain <- housingSubsidy[ trainIndex,]
housingSubsityTest  <- housingSubsidy[-trainIndex,]

# regression model 1

housingSubsidyModel_1 <- glm(y_numeric ~ .,
                    data=housingSubsityTrain %>% 
                    dplyr::select(-X, -y, -day_of_week),
                  family="binomial" (link="logit"))

#summary(housingSubsidyModel_1)

#pR2(housingSubsidyModel_1)

4. Feature Engineering

4.1 Feature engineering multiple category into Two/Three category Features

In this chapter, we feature engineered categorical data consisting of three or more categories and continuous data with distinct critical points in the distribution into two and three or more categories.

#1 education : Bchmore / Bchless

#Bchmore : "university.degree""professional.course"
#Bchless : "basic.9y""high.school""basic.6y""basic.4y""illiterate"
#unknown : "unknown"    

housingSubsidy <- housingSubsidy %>% 
  mutate(education_min = case_when(education == "university.degree"| education == "professional.course" ~  "Bchmore",
                                   education == "basic.9y"| education == "high.school"| education == "basic.6y"| education == "basic.4y"
                                   |education == "illiterate" ~  "Bchless"
                                   ,education == "unknown" ~ "unknown")) 


#2 JOB: employed / unemployed 

#employed : "blue-collar""services""admin.""entrepreneur""self-employed" "technician""management""housemaid"      
#unemployed : "unemployed""student""retired"
#unknown : "unknown"

housingSubsidy <- housingSubsidy %>% 
  mutate(job_min = case_when(job == "blue-collar"|job =="services"|job =="admin."|job =="entrepreneur"|job =="self-employed"
                             |job =="technician"|job =="management"|job =="housemaid" ~ "employed",
                             job == "unemployed"|job =="student"|job =="retired"~ "unemployed",
                             job == "unknown"~ "unknown"))


#3 marital: single / non-single

#single :  "single""divorced"
#non-single : "married" 
#unknown : "unknown" 

housingSubsidy <- housingSubsidy %>% 
  mutate(martial_min = case_when(marital == "single"|marital =="divorced" ~ "single",
                                 marital == "married" ~ "married",
                                 marital == "unknown" ~ "unknown"))

#4 month: spring_summer / autumn_winter 
 

# spring_summer: "apr" "may" "jun" "jul" "aug"
# autumn_winter: "nov" "sep" "mar" "oct" "dec"


housingSubsidy <- housingSubsidy %>% 
  mutate(month_min = case_when(month == "apr" |month == "may" |month == "jun" |month =="jul"|month == "aug" ~ "spring_summer",
                                 month == "nov" |month =="sep"|month == "mar"|month == "oct"|month == "dec" ~ "autumn_winter"))


#5 age: less55 / over55 

# less55: =< 55
# over55: > 55

housingSubsidy <- housingSubsidy %>% 
  mutate(age_min = case_when(age <= 55 ~ "less55",
                             age > 55 ~ "over55"))

#6 inflation rate: less3 / over3 

# less3: =< 3
# over3: > 3

housingSubsidy <- housingSubsidy %>% 
  mutate(inflation_rate_min = case_when(inflation_rate <= 3 ~ "less3",
                                        inflation_rate > 3 ~ "over3"))

#7 pdays: contact / non-contact 

# contact: < 999
# none-contact : >= 999

housingSubsidy <- housingSubsidy %>% 
  mutate(pdays_min = case_when(pdays < 999 ~ "contact",
                                        pdays >= 999 ~ "non-contact"))

#8 previous: less0 / over0 

# less0: =< 0
# over0: > 0

housingSubsidy <- housingSubsidy %>% 
  mutate(previous_min = case_when(previous <= 0 ~ "less0",
                                  previous > 0 ~ "over0"))

#9 spent_on_repairs: less5160 / over5160 

# less5160: =< 5160
# over5160: > 5160

housingSubsidy <- housingSubsidy %>% 
  mutate(spent_on_repairs_min = case_when(spent_on_repairs <= 5160 ~ "less5160",
                                          spent_on_repairs > 5160 ~ "over5160"))

#10 unemploy_rate: less0.5 / over0.5 

# less0.5: =< 0.5
# over0.5: > 0.5

housingSubsidy <- housingSubsidy %>% 
  mutate(unemploy_rate_min = case_when(unemploy_rate <= 0.5 ~ "less0.5",
                                          unemploy_rate > 0.5 ~ "over0.5"))



housingSubsidy_fe <- housingSubsidy %>%
    dplyr::select(y, education_min, job_min, martial_min, month_min, age_min, inflation_rate_min, pdays_min, previous_min, spent_on_repairs_min, unemploy_rate_min) %>%
    gather(Variable, value, -y) %>%
    count(Variable, value, y)

ggplot(housingSubsidy_fe, aes(value, n, fill = y)) +   
    geom_bar(position = "dodge", stat="identity", width = 0.9) +
    facet_wrap(~Variable, scales="free") +
    scale_fill_manual(values = c(bluePalette5[2],bluePalette5[4]),
                      name = "Housing\nSubsidy") +
    labs(x="Feature", y="Counts",
         title = "Feature associations with the likelihood of taking Housing Subsidy",
         subtitle = "Feature engineered category features",
         caption = "Figure 4-1-1") +
  geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "longdash", size = 0.25, color = greyPalette5[2]),
        axis.text.x = element_text(angle = 0, hjust = 0.5, size = 5))

4.2 New regression model after feature engineering

Basically, we created logistic regression model 1 using the data we had, which is called ‘the kitchen sink model, and logistic regression model 2 using feature engineered data. To measure the Goodness of fit of both model, the first and weakest option is the ‘Psuedo R Squared’, which, unlike regular R-Squared, does not vary linearly from 0 to 1. It does not describe the proportion of the variance in the dependent variable explained by the model. However, it is useful for quickly comparing different model specifications, which may be helpful for feature selection. Below, the ‘McFadden R-Squared’ is demonstrated - the higher, the better. According to the Table 4-2, $R^2$ of model 1 is higher than that of model 2, it implies that model 1 is better than model 2.

rm12 <- cbind(rm1, rm2)

kable(rm12, caption = "<center><span style='font-size:14px; color:black; font-family: Arial'>Table 4-2-1. Regression Summary</span></center>",
      col.names = c("Regression Model 1 : kitchen sink", "Regression Model 2 : engineered"), align = "c") %>% 
    kable_minimal(full_width = T, html_font = "Arial", font_size = 14) %>%
    row_spec(4, color = greyPalette5[5], background = greyPalette5[1], bold = T) %>%
    row_spec(0:6, extra_css = "line-height: 35px")

Table 4-2-1. Regression Summary
	Regression Model 1 : kitchen sink	Regression Model 2 : engineered
llh	-722.6105414	-748.2550259
llhNull	-931.0501911	-931.0501911
G2	416.8792995	365.5903304
McFadden	0.2238758	0.1963322
r2ML	0.1441071	0.1275633
r2CU	0.2876608	0.2546368

4.3 Make prediction of Two models : ROC curve & AUC

# model_1

testProbs_1 <- data.frame(Outcome = as.factor(housingSubsityTest$y_numeric),
                          Probs = predict(housingSubsidyModel_1, housingSubsityTest, type= "response"))

testProbs_1 <- 
  testProbs_1 %>%
  mutate(predOutcome  = as.factor(ifelse(testProbs_1$Probs > 0.5 , 1, 0)),
         Model = 1)


# model_2

testProbs_2 <- data.frame(Outcome = as.factor(housingSubsityTest$y_numeric),
                          Probs = predict(housingSubsidyModel_2, housingSubsityTest, type= "response"))

testProbs_2 <- 
  testProbs_2 %>%
  mutate(predOutcome  = as.factor(ifelse(testProbs_2$Probs > 0.5 , 1, 0)),
         Model = 2)

testProbs_12 <- rbind(testProbs_1,testProbs_2)


gg431 <- ggplot(testProbs_1, aes(x = Probs, fill = as.factor(Outcome))) + 
  geom_density() +
  facet_grid(Outcome ~ .) +
  scale_fill_manual(values = c(bluePalette5[2], bluePalette5[4])) + xlim(0, 1) +
  geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
  geom_vline(xintercept = 0.5, size = 0.4, color = bluePalette5[3], linetype = "longdash") +
  labs(x = "Probability of Getting Subsidy", y = "Density",
       title = "Regression Model 1",
       subtitle = "kitchen sink",
       caption = "Figure 4-3-1")  +
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
        legend.position = "none",
        strip.text.x = element_blank(),
        strip.text.y = element_blank())

Outcome.labs <- c("Don't Get Subsidy", "Get Subsidy")
names(Outcome.labs) <-c(0, 1)

gg432 <- ggplot(testProbs_2, aes(x = Probs, fill = as.factor(Outcome))) + 
  geom_density() +
  facet_grid(Outcome ~ .,labeller = as_labeller(Outcome.labs)) +
  scale_fill_manual(values = c(bluePalette5[2], bluePalette5[4])) + xlim(0, 1) +
  geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
  geom_vline(xintercept = 0.5, size = 0.4, color = bluePalette5[3], linetype = "longdash") +
  scale_y_continuous(position = "right")+
  labs(x = "Probability of Getting Subsidy", y = "",
       title = "Regression Model 2",
       subtitle = "engineered",
       caption = "Figure 4-3-2")  +
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
        legend.position = "none")


gg431 + gg432 

Looking at the blue vertical dotted line drawn in Figures 4-3-1, 4-3-2, it is drawn with a 50% probability. Based on this threshold, if we say we’re going to get subsidies if we go above 50%, and if we don’t go above 50%, is our model a good model? Table 4-3 below provides indicators to confirm this. Here, it is generally considered a good model if the sensitivity and specificity are high. However, the low sensitivity seems to need to be improved because the purpose of this survey is to accurately predict the people who will receive the housing subsidy. To confirm this, in the next chapter, it will be necessary to create a confusion matrix for all thresholds and estimate the optimal thresholds.

# model_1

cf1 <- caret::confusionMatrix(testProbs_1$predOutcome, testProbs_1$Outcome, positive = "1")
cf2 <- caret::confusionMatrix(testProbs_2$predOutcome, testProbs_2$Outcome, positive = "1")

cf <- round(rbind(cf1$byClass, cf2$byClass),3)

rownames(cf) <- c("Kitchen sink model","Engineered model")
colnames(cf) <- c("Sens","Spec","Pos Pred","Neg Pred","Precis","Recall","F1","Preval","Det Rate","Det Preval", "Bal Accu")


kable(cf, caption = "<center><span style='font-size:14px; color:black; font-family: Arial'>Table 4-3. confusion metrics for the threshold of 50% of both model</span></center>", align = "c", booktabs = T, escape=FALSE) %>%
  kable_minimal(full_width = T, html_font = "Arial", font_size = 13)%>%
  column_spec(1, bold = T)  %>%
  row_spec(1:2, extra_css = "line-height: 30px")

Table 4-3. confusion metrics for the threshold of 50% of both model
	Sens	Spec	Pos Pred	Neg Pred	Precis	Recall	F1	Preval	Det Rate	Det Preval	Bal Accu
Kitchen sink model	0.239	0.987	0.685	0.915	0.685	0.239	0.354	0.108	0.026	0.038	0.613
Engineered model	0.239	0.988	0.712	0.915	0.712	0.239	0.357	0.108	0.026	0.036	0.614

In Figures 4-3-3, 4-3-4, the ROC curve was prepared. This curve shows the relationship between True Positive and False Positive for all thresholds, and the gray diagonal makes predictions in the absence of a model, in other words, because it means predicting with the probability of coin flip, if the ROC curve is formed below the gray diagonal, it becomes an inferior model. This graph provides a very good intuition for the fit of the model, especially by comparing AUC:area under curve, we can compare the ratios of True positive. Looking at the AUC of the two models, the Engineered model is 0.781 and the Kitchen sink model is 0.755, indicating that the Engineered model is a better model.

auc1 <- round(pROC::auc(testProbs_1$Outcome, testProbs_1$Probs),3)
auc2 <- round(pROC::auc(testProbs_2$Outcome, testProbs_2$Probs),3)

auc1 <- paste("kitchen sink\n\n", "AUC:", auc1)
auc2 <- paste("engineered\n\n","AUC:", auc2)


ggauc1 <- ggplot(testProbs_1, aes(d = as.numeric(testProbs_1$Outcome), m = Probs)) +
  geom_roc(n.cuts = 30, labels = FALSE, colour = bluePalette5[3], pointsize = 0.4, size = 0.6) +
  style_roc(theme = theme_grey) +
  geom_abline(slope = 1, intercept = 0, size = 0.4, color = greyPalette5[3]) +
  labs(title = "ROC curve",
       subtitle = auc1,
       caption = "Figure 4-3-3",
       y = "True Positive Fraction") +
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
        legend.position = "none")


ggauc2 <- ggplot(testProbs_2, aes(d = as.numeric(testProbs_2$Outcome), m = Probs)) +
  geom_roc(n.cuts = 30, labels = FALSE, colour = bluePalette5[4], pointsize = 0.4, size = 0.6) +
  style_roc(theme = theme_grey) +
  geom_abline(slope = 1, intercept = 0, size = 0.4, color = greyPalette5[3]) +
  scale_y_continuous(position = "right")+
  labs(title = "ROC curve",
       subtitle = auc2,
       caption = "Figure 4-3-4",
       y = "True Positive Fraction")+
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
        legend.position = "none")


ggauc1 + ggauc2

4.4 Cross validation of both models

In this section, each model is divided into 100 folds and each ROC, Sensitivity, and Specificity is calculated. Similarly, the Engineered model appears to be much better than kitchen sink model with higher value of Sensitivity. Looking at Figure 4-4-2, generalizability is also shown to be superior because sensitivity is also focused on the mean value of the engineered model. However, Specificity shows a worrying level of Outlier in both models.

ctrl2 <- trainControl(method = "cv", number = 100, classProbs=TRUE, summaryFunction=twoClassSummary)

cv_1 <- train(y ~ .,
              data=housingSubsidy %>%
                dplyr::select(-X, -inflation_rate_min, -pdays_min, -previous_min, -spent_on_repairs_min, -unemploy_rate_min,
                              -education_min, -job_min, -martial_min, -month_min, -age_min, -y_numeric), 
                method="glm", family="binomial", metric="ROC", trControl = ctrl2)

cv_gfm1 <- round(cv_1$results[2:4],4)

cv_2 <- train(y ~ .,
              data=housingSubsidy %>% 
                dplyr::select(-X, -education, -job, -marital, -month, -age, -day_of_week, -inflation_rate_min, -pdays_min, -previous_min, -spent_on_repairs_min, -unemploy_rate_min, -y_numeric), 
                method="glm", family="binomial",
                metric="ROC", trControl = ctrl2)

cv_gfm2 <- round(cv_2$results[2:4],4)

cv_gfm <- rbind(cv_gfm1, cv_gfm2) 

rownames(cv_gfm) <- c("Kitchen sink model","Engineered model")
colnames(cv_gfm) <- c("ROC(AUC)","Sensitivity","Specificity")

cv_gfm <- cv_gfm %>% mutate(Sensitivity = cell_spec(Sensitivity, background = ifelse(Sensitivity > 0.982, bluePalette5[2], "white"),
                                                     bold = ifelse(Sensitivity > 0.982, T, F)))

kable(cv_gfm, caption = "<center><span style='font-size:14px; color:black; font-family: Arial'>Table 4-4-1. The Sensitivity(True Positive Rate) of each model</span></center>", align = "c", booktabs = T, escape=FALSE) %>%
  kable_minimal(full_width = F, html_font = "Arial", font_size = 16)%>%
  column_spec(2:4, width = "5.3cm") %>%
  column_spec(1, bold = T)  %>%
  row_spec(1:2, extra_css = "line-height: 30px")

Table 4-4-1. The Sensitivity(True Positive Rate) of each model
	ROC(AUC)	Sensitivity	Specificity
Kitchen sink model	0.7645	0.9802	0.2155
Engineered model	0.7725	0.9858	0.2160

cv_1_resample <- cv_1$resample %>%
  dplyr::select(-Resample) %>%
  gather(metric, value) %>% # column to row with value (100obj > 300obj)
  left_join(gather(cv_1$results[2:4], metric, mean)) %>% #metric is key to left join
  mutate(Model = "Kitchen Sink Model")


cv_2_resample <- cv_2$resample %>%
  dplyr::select(-Resample) %>%
  gather(metric, value) %>% # column to row with value (100obj > 300obj)
  left_join(gather(cv_2$results[2:4], metric, mean)) %>% #metric is key to left join
  mutate(Model = "Engineered_Model")

cv_12_resample <- rbind(cv_1_resample,cv_2_resample)

ggplot(cv_12_resample, aes(value)) + 
    geom_histogram(bins=35, fill = bluePalette5[3], color = "white") +
    facet_wrap(vars(Model, metric)) +
    geom_hline(yintercept = 0, size = 0.25, color = greyPalette5[3]) +
    geom_vline(aes(xintercept = mean), colour = bluePalette5[4], linetype = 4, size = 0.7) +
    scale_x_continuous(limits = c(0, 1)) +
    labs(x="Goodness of Fit", y="Count", title="CV Goodness of Fit Metrics",
         subtitle = "Across-fold mean reprented as dotted lines",
         caption = "Figure 4-4-2")+
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.25, color = greyPalette5[2]),
        legend.position = "none")

5. Cost-Benefit analysis

5.1 Cost-Benefit eqaution

We have previously checked the suitability of the current prediction model from various perspectives. However, the purpose of this study is to derive the best scenario for maximum effectiveness by allocating limited resources. This requires estimating the costs and benefits of each case. We estimated the total of four categories of Revenue as follows.

True Positive : Predicted correctly homeowner would enter credit program allocated the marketing resources, and 25% ultimately achieved the credit

Cost/benefit	$
session costs(mailers, phone calls, and information/counseling)	-$2,850
subsidy costs	-$5,000 * 0.25
premium after taking credit	+$10,000 * 0.25
Neighborhood premium	+$56,000 * 0.25
Total Revenue	+$12,400

True Negative : Predicted correctly homeowner would not take the credit, no marketing resources were allocated, and no credit was allocated

False Positive : Predicted incorrectly homeowner would take the credit; allocated marketing resources; no credit allocated.

Cost/benefit	$
session costs(mailers, phone calls, and information/counseling)	-$2,850
Total Revenue	-$2,850

False Negative : We predicted that a homeowner would not take the credit but they did. These are likely homeowners who signed up for reasons unrelated to the marketing campaign. Thus, we ‘0 out’ this category, assuming the cost/benefit of this is $0.

cost_benefit_table <-
   testProbs_2 %>%
      count(predOutcome, Outcome) %>%
      summarize(True_Negative = sum(n[predOutcome==0 & Outcome==0]),
                True_Positive = sum(n[predOutcome==1 & Outcome==1]),
                False_Negative = sum(n[predOutcome==0 & Outcome==1]),
                False_Positive = sum(n[predOutcome==1 & Outcome==0])) %>%
       gather(Variable, Count) %>%
       mutate(Revenue =
               ifelse(Variable == "True_Negative", Count * 0,
               ifelse(Variable == "True_Positive",(Count * (-2850-5000*0.25+10000*0.25+56000*0.25)),
               ifelse(Variable == "False_Negative", Count * 0,
               ifelse(Variable == "False_Positive", Count * (-2850), 0))))) %>%
    bind_cols(data.frame(Description = c(
              "We predicted correctly homeowner would not take the credit",
              "We predicted correctly homeowner would enter credit program",
              "We predicted that a homeowner would not take the credit but they did",
              "We predicted incorrectly homeowner would take the credit")))


kable(cost_benefit_table, caption = "<center><span style='font-size:14px; color:black; font-family: Arial'>Table 5-1. Cost/Benefit Table of 50% thresholds</span></center>",
      align = "c") %>% 
      kable_minimal(full_width = T, html_font = "Arial", font_size = 15) %>%
      column_spec(1, bold = TRUE) %>%
      row_spec(c(2,4), background = greyPalette5[1]) %>%
      row_spec(1:4, extra_css = "line-height: 35px") 

Table 5-1. Cost/Benefit Table of 50% thresholds
Variable	Count	Revenue	Description
True_Negative	1270	0	We predicted correctly homeowner would not take the credit
True_Positive	37	458800	We predicted correctly homeowner would enter credit program
False_Negative	118	0	We predicted that a homeowner would not take the credit but they did
False_Positive	15	-42750	We predicted incorrectly homeowner would take the credit

5.2 Optimize Threshholds

In this section, we’ll apply cost-benefit equations to our regression model to determine revenue at each threshold. Figure 5-2-1 shows how much the cost is incurred for each threshold. Up to the threshold of 0.17 you can see that the cost reduction effect of decreasing False Negative and the revenue reduction effect of True Positive decrease offset each other, but if the threshold increases further, the revenue decrease faster than that of False Negative.

iterateThresholds <- function(data) {
  x = .01
  all_prediction <- data.frame()
  while (x <= 1) {
    this_prediction <- testProbs_2 %>%
      mutate(predOutcome = ifelse(Probs > x, 1, 0)) %>%
      count(predOutcome, Outcome) %>%
      summarize(True_Negative = sum(n[predOutcome==0 & Outcome==0]),
                True_Positive = sum(n[predOutcome==1 & Outcome==1]),
                False_Negative = sum(n[predOutcome==0 & Outcome==1]),
                False_Positive = sum(n[predOutcome==1 & Outcome==0])) %>%
       gather(Variable, Count) %>%
       mutate(Revenue =
               ifelse(Variable == "True_Negative", Count * 0,
               ifelse(Variable == "True_Positive",(Count * (-2850-5000*0.25+10000*0.25+56000*0.25)),
               ifelse(Variable == "False_Negative", Count * 0,
               ifelse(Variable == "False_Positive", Count * (-2850), 0)))),
              Threshold = x)
    all_prediction <- rbind(all_prediction, this_prediction)
    x <- x + .01
  }
return(all_prediction)
}

whichThreshold <- iterateThresholds(testProbs2)

whichThreshold %>%
  ggplot(.,aes(Threshold, Revenue, colour = Variable)) +
  geom_point() +
  scale_colour_manual(values = bluePalette5[2:5]) +    
  labs(title = "Revenue by confusion matrix type and threshold",
       y = "Revenue",
       caption = "Figure 5-2-1") +
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.35, color = greyPalette5[2]),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.35, color = greyPalette5[2]),
        legend.position = "bottom",
        legend.background = element_rect(fill = "white")) +
  guides(colour=guide_legend(title = "Confusion Matrix")) 

In Figure 5-2-3, as the threshold increases, there are more and more people who receive the actual subsidy, but in Figure 5-2-2, we can see that when the threshold is 0.17, we get the most revenue. It seems that it can be an important indicator of judgment when it is aimed at maximizing the effectiveness of using limited resources.

whichThreshold_revenue_subsidy <- 
whichThreshold %>% 
    mutate(Credits = 
             ifelse(Variable == "True_Positive", Count*0.25,
             ifelse(Variable == "False_Negative", Count, 0))) %>%
    group_by(Threshold) %>% 
    summarize(Revenue = sum(Revenue)/1000,
              Total_Count_of_Credits = sum(Credits))

gg522 <- ggplot(whichThreshold_revenue_subsidy, aes(Threshold, Revenue)) +
  geom_point(color = bluePalette5[3], size = 1.2) +
  scale_colour_manual(values = bluePalette5) +    
  labs(title = "Revenue by threshold",
       y = "Revenue(1,000$)",
       caption = "Figure 5-2-2") +
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
        legend.position = "bottom",
        axis.text.y = element_text(angle = 90)) +
  guides(colour=guide_legend(title = "Confusion Matrix")) 

gg523 <- ggplot(whichThreshold_revenue_subsidy, aes(Threshold, Total_Count_of_Credits)) +
  geom_point(color = bluePalette5[4], size = 1.2) +
  scale_colour_manual(values = bluePalette5) +
  scale_y_continuous(position = "right")+
  labs(title = "Total Counts of Credits by threshold",
       y = "Total Counts of Credits",
       caption = "Figure 5-2-3") +
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
        legend.position = "bottom",
        axis.text.y = element_text(angle = 270)) +
  guides(colour=guide_legend(title = "Confusion Matrix")) 


gg522 + gg523

5.3 Comparison table of 50% threshold and and optimal threshold

The threshold is a baseline for how to analyze each probability obtained as a result of a logistic regression. Intuitively, it seems that if the probability from the predictive model exceeds 50%, it can be interpreted as taking subsidies, but this is not an indicator of the suitability of the model. In our cost/profit model, the maximum profit can be generated only by maximizing the True positive and minimizing the False positive. Here we can think of Sensitivity again. Sensitivity, which also called True Positive Rate, refers to the proportion of actual subsidies among the results of predicting receiving subsidies. Figure 5-3-1 visualizes the Revenue corresponding to each threshold. It can be seen that the Revenue rises rapidly up to the critical point of 0.17, and then decreases continuously after that. In Table 5-3, the actual return of 50% and the optimal threshold of 17% were compared with the recipient of the subsidy. The overall profit differs by about $339,400, and the optimal threshold of 17% is 181% of the revenue when the threshold is 50%. This can be seen as a meaningful level.

whichThreshold_revenue_subsidy <- 
whichThreshold %>% 
    mutate(Credits = 
             ifelse(Variable == "True_Positive", Count*0.25,
             ifelse(Variable == "False_Negative", Count, 0))) %>%
    group_by(Threshold) %>% 
    summarize(Revenue = sum(Revenue),
              Total_Count_of_Credits = sum(Credits))


finalTable <- whichThreshold_revenue_subsidy %>%
  filter(Threshold >= 0.50 & Threshold < 0.51 | Threshold >= pull(arrange(whichThreshold_revenue_subsidy, -Revenue)[1,1]) & Threshold < pull(arrange(whichThreshold_revenue_subsidy, -Revenue)[1,1])+0.01 )

kable(finalTable, caption = "<center><span style='font-size:14px; color:black; font-family: Arial'>Table 5-3. Cost/Benefit Comparison Table</span></center>",
      align = "c") %>% 
      kable_minimal(full_width = T, html_font = "Arial", font_size = 16)%>%
      column_spec(1:2, bold = TRUE) %>%
      row_spec(1, color = orangePalette5[5]) %>%
      row_spec(2, color = bluePalette5[4]) %>%
      row_spec(1:2, extra_css = "line-height: 30px") 

Table 5-3. Cost/Benefit Comparison Table
Threshold	Revenue	Total_Count_of_Credits
0.17	755450	95.00
0.50	416050	127.25

ggplot(whichThreshold_revenue_subsidy)+
  geom_line(aes(x = Threshold, y = Revenue), color = greyPalette5[3], size = 0.8)+
  geom_vline(xintercept =  pull(arrange(whichThreshold_revenue_subsidy, -Revenue)[1,1]), color = orangePalette5[5], size = 0.8, linetype = "longdash")+
  geom_vline(xintercept = 0.5, color = bluePalette5[4], size = 0.8, linetype = "longdash")+
  scale_x_continuous(minor_breaks = seq(0, 1, 0.05))+
  labs(title = "Model Revenues By Threshold",
       subtitle = "Orange line : 0.17 Optimal Threshold           Blue line : 0.5 Threshold",
       caption = "Figure 5-3-1")+
  corTheme2()+
  theme(panel.background = element_blank(),
        panel.grid.major.y = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
        panel.grid.major.x = element_line(linetype = "dashed", size = 0.45, color = greyPalette5[2]),
        panel.grid.minor.x = element_line(linetype = "dotted", size = 0.45, color = greyPalette5[2]),
        legend.position = "bottom",
        axis.text.y = element_text(angle = 90))

Ⅳ. Conclustion

The purpose of this analysis is not to allocate limited resources at random, but to allocate resources strategically for maximum policy effect. Of course, this is a fairly narrow-minded analysis because it analyzes policies only in terms of cost and benefit. If we allocated resources randomly, the sensitivity would be 50%. However, according to CV tests, we have almost 90% more sensitivity, which means we have almost exactly identified who will receive the subsidy. In addition, in each case, we estimated costs and benefits to derive thresholds that make up the most profitable groups in practice. This can be a very useful tool of judgment in terms of cost and profit.

Targeting A Housing Subsidy

Minwook Kang

2022-11-03