### BACHELOR'S THESIS - R CODE - JAKUB STRAĆ LIPKA ### INITIAL STEPS ------------------------------------------------------------ #libraries library(plyr) library(dplyr) library(car) library(aod) library(lmtest) library(lubridate) library(tidyverse) library(hrbrthemes) library(scales) library(ggpubr) library(ggalt) library(ggcorrplot) library(corrplot) library(pedometrics) library(blorr) library(stargazer) library(pscl) library(mfx) library(miscTools) library(caret) library(caTools) library(ROCR) library(vip) library(ggplot2) library(jtools) library(psych) #working directory setwd("...") ### DATA UPLOAD AND CLEANING ------------------------------------------------------------------------ #read raw data dataraw<-read.csv("...\\Book1.csv",sep = ';') #remove observations with NA's in dependent variable dataraw <- dataraw %>% filter(!is.na(NPL)) dataraw <- dataraw %>% rename(DSTI=44) #read and modify inflation data inflation <-read.csv("...\\inflation.csv",sep = ';') inflation <- inflation %>% rename('month' = 1, 'year' = 2) inflation <- inflation %>% select(-c(4,5,6,7,8)) #combine yr, mth, day into date of loan origination and date of default dataraw <- dataraw %>% mutate('origdate' = make_date(year=origyear, month=origmonth, day=origday)) dataraw <- dataraw %>% mutate('NPLdate' = make_date(year=NPLyear, month=NPLmonth, day=NPLday)) dataraw <- dataraw %>% left_join(x=dataraw, y=inflation, by = c('origmonth'='month', 'origyear'='year')) dataraw <- dataraw %>% mutate(income_real = income/cpi) #create copy of raw data dataraw1 <- dataraw #set negative values equal to 0 dataraw1$LTV_cont[dataraw$LTV_cont < 0] <- 0 dataraw1$DSTI[dataraw$DSTI < 0] <- 0 #dataraw1$DTI_init[dataraw$DTI_init < 0] <- 0 #summary(dataraw) #split into NPL's and performing nplraw <- dataraw1 %>% filter(NPL == 1) performingraw <- dataraw1 %>% filter(NPL == 0) #filter based on expert opinion data1 <- filter(dataraw1, LTV_cont < 5 & LTV_cont >=0 & DSTI < 2 & DSTI >= 0 & income_real < 5000000 & income_real >= 0 ) npl1 <- data1 %>% filter(NPL == 1) #check extremes in this case quantile(data1$LTV_cont, 0.99) quantile(data1$LTV_cont, 0.999) mean(data1$LTV_cont)+3*sd(data1$LTV_cont) quantile(data1$DSTI, 0.99) quantile(data1$DSTI, 0.999) mean(data1$DSTI)+3*sd(data1$DSTI) quantile(data1$income, 0.99) quantile(data1$income, 0.999) mean(data1$income)+3*sd(data1$income) #filter from original data that lie within 3 standard deviations (sd value from data1); those observations with 0 or 1 NA data2 <- filter(dataraw1, !is.na(YR_last_renew) & #we know reconstruction year (LTV_init <= 1 | is.na(LTV_init)) & #initial LTV of 1 or less or NA (LTV_cont < mean(data1$LTV_cont)+3*sd(data1$LTV_cont) | is.na(LTV_cont)) & #choose contemporaneous LTV within 3sd from mean or NA (income < mean(data1$income)+3*sd(data1$income) | is.na(income)) & # -||- income (DSTI < mean(data1$DSTI)+3*sd(data1$DSTI) | is.na(DSTI)) # -||- DSTI ) #select only relevant columns data2 <- data2 %>% dplyr::select(NPL , LTV_init , LTV_cont , DSTI , DTI_init , IR , float , matur_doba_splaceni , loanageless5 , income , income_real , ageless36 , married , university , unemployed , entrepreneur , new_client , bohemia , rental , flat , built1980 , built2015 , YR_last_renew , EPC , EPC_AB , EPC_EFG , if_EPC_real , origdate , NPLdate ) #sum NA's in each row data2$countNA <- rowSums(is.na(data2)) npl2 <- data2 %>% filter(NPL == 1) #remove observations with more than 1 missing explanatory variable data3 <- data2 %>% filter((NPL == 0 & countNA <= 2) | (NPL == 1 & countNA <= 1)) #replace missing values by medians data3 <- data3 %>% mutate(across(c(income, income_real, matur_doba_splaceni, LTV_init, LTV_cont, DSTI), ~replace_na(., median(., na.rm=TRUE)))) #auxiliary variable data3 <- data3 %>% mutate(if_EPC_estim = (ifelse(if_EPC_real == 1, 0, 1))) summary(data3) #split into performing and non-performing npl3 <- data3 %>% filter(NPL == 1) performing3 <- data3 %>% filter(NPL == 0) #summary stats per EPC group data_ab <- data3 %>% filter(EPC_AB == 1) summary(data_ab) data_cd <- data3 %>% filter(EPC_AB == 0 & EPC_EFG == 0) summary(data_cd) data_efg <- data3 %>% filter(EPC_EFG == 1) summary(data_efg) #shift income by 1 CZK for use of logarithm data3$income <- data3$income + 1 data3$income_real <- data3$income_real + 1 ### END OF DATA CLEANING --------------------------------------------------------- summary(data3) ### PLOTS ------------------------------------------------------------------------ #plot of time distribution of origination and defaults plot1 <- data.frame(type = c(rep('Origination ? Performing',nrow(performing3)), rep('Origination ? Defaulted',nrow(npl3)), rep('Defaults',nrow(npl3))), value = c(performing3$origdate ,npl3$origdate, npl3$NPLdate)) ggplot(plot1, aes(x=value)) + geom_histogram(aes(fill = type), bins = 23, color = 'grey90', size = 0.1) + facet_grid(type~., scales = 'free') + theme_bw() + theme(legend.title = element_blank(), legend.position = "none") + scale_x_date(name = element_blank(), date_breaks = '2 years', date_minor_breaks = "1 year", date_labels = '%Y') + scale_y_continuous(name = 'Number of Loans') #scatterplot for outlier identification -- computationally intensive plot2 <- data.frame(type = c(rep('Performing',nrow(performingraw)), rep('Defaulted',nrow(nplraw)), rep('Performing',nrow(performingraw)), rep('Defaulted',nrow(nplraw)), rep('Performing',nrow(performingraw)), rep('Defaulted',nrow(nplraw)), rep('Performing',nrow(performingraw)), rep('Defaulted',nrow(nplraw)), rep('Performing',nrow(performingraw)), rep('Defaulted',nrow(nplraw))), var = c(rep('LTV Initial',nrow(dataraw1)), rep('LTV Contemporaneous', nrow(dataraw1)),rep('DSTI', nrow(dataraw1)), rep('IR', nrow(dataraw1)), rep('Income', nrow(dataraw1))), value = c(performingraw$LTV_init ,nplraw$LTV_init, performingraw$LTV_cont ,nplraw$LTV_cont, performingraw$DSTI, nplraw$DSTI, performingraw$IR, nplraw$IR, performingraw$income, nplraw$income) ) #too complex and slow #ggplot(data=plot2, aes(x=1, y=value, color=type)) + # geom_jitter() + # theme_bw() + # facet_grid(var~., scales = 'free') + # theme(legend.title = element_blank(), legend.position = "bottom", axis.title = element_blank(), axis.ticks = element_blank(), axis.text.x = element_blank()) + # scale_y_continuous(labels = label_number()) #EPC distribution and NPL growth plot3 <- data.frame(type = c(rep("Total", 10), rep("Actual", 10)), group = c(rep('EPC Class', 7), rep('EPC Group', 3)), label = c('A','B','C','D','E','F','G','AB','CD','EFG','A','B','C','D','E','F','G','AB','CD','EFG'), count = c(nrow(data3 %>% filter(EPC == 'A ')), nrow(data3 %>% filter(EPC == 'B ')), nrow(data3 %>% filter(EPC == 'C ')), nrow(data3 %>% filter(EPC == 'D ')), nrow(data3 %>% filter(EPC == 'E ')), nrow(data3 %>% filter(EPC == 'F ')), nrow(data3 %>% filter(EPC == 'G ')), nrow(data3 %>% filter(EPC == 'A ' | EPC == 'B ')), nrow(data3 %>% filter(EPC == 'C ' | EPC == 'D ')), nrow(data3 %>% filter(EPC == 'E ' | EPC == 'F ' | EPC == 'G ')), nrow(data3 %>% filter(EPC == 'A ' & if_EPC_real == 1)), nrow(data3 %>% filter(EPC == 'B ' & if_EPC_real == 1)), nrow(data3 %>% filter(EPC == 'C ' & if_EPC_real == 1)), nrow(data3 %>% filter(EPC == 'D ' & if_EPC_real == 1)), nrow(data3 %>% filter(EPC == 'E ' & if_EPC_real == 1)), nrow(data3 %>% filter(EPC == 'F ' & if_EPC_real == 1)), nrow(data3 %>% filter(EPC == 'G ' & if_EPC_real == 1)), nrow(data3 %>% filter((EPC == 'A ' | EPC == 'B ') & if_EPC_real == 1)), nrow(data3 %>% filter((EPC == 'C ' | EPC == 'D ') & if_EPC_real == 1)), nrow(data3 %>% filter((EPC == 'E ' | EPC == 'F ' | EPC == 'G ') & if_EPC_real == 1))), nplcount = c(nrow(npl3 %>% filter(EPC == 'A ')), nrow(npl3 %>% filter(EPC == 'B ')), nrow(npl3 %>% filter(EPC == 'C ')), nrow(npl3 %>% filter(EPC == 'D ')), nrow(npl3 %>% filter(EPC == 'E ')), nrow(npl3 %>% filter(EPC == 'F ')), nrow(npl3 %>% filter(EPC == 'G ')), nrow(npl3 %>% filter(EPC == 'A ' | EPC == 'B ')), nrow(npl3 %>% filter(EPC == 'C ' | EPC == 'D ')), nrow(npl3 %>% filter(EPC == 'E ' | EPC == 'F ' | EPC == 'G ')), nrow(npl3 %>% filter(EPC == 'A ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'B ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'C ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'D ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'E ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'F ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'G ' & if_EPC_real == 1)), nrow(npl3 %>% filter((EPC == 'A ' | EPC == 'B ') & if_EPC_real == 1)), nrow(npl3 %>% filter((EPC == 'C ' | EPC == 'D ') & if_EPC_real == 1)), nrow(npl3 %>% filter((EPC == 'E ' | EPC == 'F ' | EPC == 'G ') & if_EPC_real == 1))), ratio = c(nrow(npl3 %>% filter(EPC == 'A '))/nrow(data3 %>% filter(EPC == 'A ')), nrow(npl3 %>% filter(EPC == 'B '))/nrow(data3 %>% filter(EPC == 'B ')), nrow(npl3 %>% filter(EPC == 'C '))/nrow(data3 %>% filter(EPC == 'C ')), nrow(npl3 %>% filter(EPC == 'D '))/nrow(data3 %>% filter(EPC == 'D ')), nrow(npl3 %>% filter(EPC == 'E '))/nrow(data3 %>% filter(EPC == 'E ')), nrow(npl3 %>% filter(EPC == 'F '))/nrow(data3 %>% filter(EPC == 'F ')), nrow(npl3 %>% filter(EPC == 'G '))/nrow(data3 %>% filter(EPC == 'G ')), nrow(npl3 %>% filter(EPC == 'A ' | EPC == 'B '))/nrow(data3 %>% filter(EPC == 'A ' | EPC == 'B ')), nrow(npl3 %>% filter(EPC == 'C ' | EPC == 'D '))/nrow(data3 %>% filter(EPC == 'C ' | EPC == 'D ')), nrow(npl3 %>% filter(EPC == 'E ' | EPC == 'F ' | EPC == 'G '))/nrow(data3 %>% filter(EPC == 'E ' | EPC == 'F ' | EPC == 'G ')), nrow(npl3 %>% filter(EPC == 'A ' & if_EPC_real == 1))/nrow(data3 %>% filter(EPC == 'A ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'B ' & if_EPC_real == 1))/nrow(data3 %>% filter(EPC == 'B ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'C ' & if_EPC_real == 1))/nrow(data3 %>% filter(EPC == 'C ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'D ' & if_EPC_real == 1))/nrow(data3 %>% filter(EPC == 'D ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'E ' & if_EPC_real == 1))/nrow(data3 %>% filter(EPC == 'E ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'F ' & if_EPC_real == 1))/nrow(data3 %>% filter(EPC == 'F ' & if_EPC_real == 1)), nrow(npl3 %>% filter(EPC == 'G ' & if_EPC_real == 1))/ nrow(data3 %>% filter(EPC == 'G ' & if_EPC_real == 1)), nrow(npl3 %>% filter((EPC == 'A ' | EPC == 'B ') & if_EPC_real == 1))/ nrow(data3 %>% filter((EPC == 'A ' | EPC == 'B ') & if_EPC_real == 1)), nrow(npl3 %>% filter((EPC == 'C ' | EPC == 'D ') & if_EPC_real == 1))/nrow(data3 %>% filter((EPC == 'C ' | EPC == 'D ') & if_EPC_real == 1)), nrow(npl3 %>% filter((EPC == 'E ' | EPC == 'F ' | EPC == 'G') & if_EPC_real == 1))/nrow(data3 %>% filter((EPC == 'E ' | EPC == 'F ' | EPC == 'G ') & if_EPC_real == 1))) ) ggplot(data = plot3) + geom_bar(aes(x=label, fill = label, weight = count)) + facet_grid(factor(type, levels = c('Total', 'Actual')) ~ factor(group, levels = c('EPC Class', 'EPC Group')), scales = "free", space = "free_x") + theme_bw() + theme(legend.title = element_blank(), legend.position = "none") + labs(x = 'EPC', y = 'Number of Loans') + scale_fill_manual(values=c("#209E52", "#209E52", "#63B878", "#A3CF89", "#F8CB27", "#F8CB27", '#F79532', '#D82834', '#F16030', '#D82834')) ggplot(data = plot3) + geom_line(aes(x=label,y=ratio,group=1), color = 'black') + geom_point(aes(x=label,y=ratio), color = 'black') + facet_grid(factor(type, levels = c('Total', 'Actual')) ~ factor(group, levels = c('EPC Class', 'EPC Group')), scales = "free", space = "free_x") + theme_bw() + labs(x = 'EPC', y = 'NPL Share') + scale_y_continuous(labels = label_percent()) #LTV cont plot4 <- data.frame(Type = c(rep('Performing',nrow(performing3)), rep('Defaulted',nrow(npl3))), value = c(performing3$LTV_cont ,npl3$LTV_cont)) mu4 <- ddply(plot4, "Type", summarise, grp.mean=mean(value)) p4 <- ggplot(plot4, aes(x=value, fill = Type)) + geom_histogram(aes(y=..density..), alpha = 0.3, color = 'grey90', position = 'identity', bins = 17, size = 0.1) + geom_vline(data=mu4, aes(xintercept=grp.mean, color=Type), linetype="dashed", size = 0.7, alpha = 0.7) + #facet_grid(type~., scales = 'free') + theme_bw() + theme(legend.title = element_blank(), legend.position = "bottom") + scale_x_continuous(name = 'LTV Contemporaneous', labels = label_percent()) + scale_y_continuous(name = 'Density') #LTV init plot5 <- data.frame(Type = c(rep('Performing',nrow(performing3)), rep('Defaulted',nrow(npl3))), value = c(performing3$LTV_init ,npl3$LTV_init)) mu5 <- ddply(plot5, "Type", summarise, grp.mean=mean(value)) p5 <- ggplot(plot5, aes(x=value, fill = Type)) + geom_histogram(aes(y=..density..), alpha = 0.3, color = 'grey90', position = 'identity', bins = 17, size = 0.1) + geom_vline(data=mu5, aes(xintercept=grp.mean, color=Type), linetype="dashed", size = 0.7, alpha = 0.7) + #facet_grid(type~., scales = 'free') + theme_bw() + theme(legend.title = element_blank(), legend.position = "bottom") + scale_x_continuous(name = 'LTV Initial', labels = label_percent()) + scale_y_continuous(name = element_blank()) #DSTI init plot6 <- data.frame(Type = c(rep('Performing',nrow(performing3)), rep('Defaulted',nrow(npl3))), value = c(performing3$DSTI ,npl3$DSTI)) mu6 <- ddply(plot6, "Type", summarise, grp.mean=mean(value)) p6 <- ggplot(plot6, aes(x=value, fill = Type)) + geom_histogram(aes(y=..density..), alpha = 0.3, color = 'grey90', position = 'identity', bins = 17, size = 0.1) + geom_vline(data=mu6, aes(xintercept=grp.mean, color=Type), linetype="dashed", size = 0.7, alpha = 0.7) + #facet_grid(type~., scales = 'free') + theme_bw() + theme(legend.title = element_blank(), legend.position = "bottom") + scale_x_continuous(name = 'DSTI', labels = label_percent()) + scale_y_continuous(name = 'Density') #IR plot7 <- data.frame(Type = c(rep('Performing',nrow(performing3)), rep('Defaulted',nrow(npl3))), value = c(performing3$IR ,npl3$IR)) mu7 <- ddply(plot7, "Type", summarise, grp.mean=mean(value)) p7 <- ggplot(plot7, aes(x=value, fill = Type)) + geom_histogram(aes(y=..density..), alpha = 0.3, color = 'grey90', position = 'identity', bins = 17, size = 0.1) + geom_vline(data=mu7, aes(xintercept=grp.mean, color=Type), linetype="dashed", size = 0.7, alpha = 0.7) + #facet_grid(type~., scales = 'free') + theme_bw() + theme(legend.title = element_blank(), legend.position = "bottom") + scale_x_continuous(name = 'Interest Rate (in %)', limits = c(1,7)) + scale_y_continuous(name = 'Density') #DTI init plot8 <- data.frame(Type = c(rep('Performing',nrow(performing3)), rep('Defaulted',nrow(npl3))), value = c(performing3$DTI_init ,npl3$DTI_init)) mu8 <- ddply(plot8, "Type", summarise, grp.mean=mean(value)) p8 <- ggplot(plot8, aes(x=value, fill = Type)) + geom_histogram(aes(y=..density..), alpha = 0.3, color = 'grey90', position = 'identity', bins = 17, size = 0.1) + geom_vline(data=mu8, aes(xintercept=grp.mean, color=Type), linetype="dashed", size = 0.7, alpha = 0.7) + #facet_grid(type~., scales = 'free') + theme_bw() + theme(legend.title = element_blank(), legend.position = "bottom") + scale_x_continuous(name = 'DTI', limits = c(0,15)) + scale_y_continuous(name = element_blank()) #DTI init plot9 <- data.frame(Type = c(rep('Performing',nrow(performing3)), rep('Defaulted',nrow(npl3))), value = c(performing3$matur_doba_splaceni ,npl3$matur_doba_splaceni)) mu9 <- ddply(plot9, "Type", summarise, grp.mean=mean(value)) p9 <- ggplot(plot9, aes(x=value, fill = Type)) + geom_histogram(aes(y=..density..), alpha = 0.3, color = 'grey90', position = 'identity', bins = 17, size = 0.1) + geom_vline(data=mu9, aes(xintercept=grp.mean, color=Type), linetype="dashed", size = 0.7, alpha = 0.7) + #facet_grid(type~., scales = 'free') + theme_bw() + theme(legend.title = element_blank(), legend.position = "bottom") + scale_x_continuous(name = 'Maturity (in months)', limits = c(0,400)) + scale_y_continuous(name = element_blank()) ggarrange(p4,p5,p6,p8,p7,p9, ncol = 2, nrow = 3, common.legend = T, legend = 'bottom') #binary loan characteristics plot10 <- data.frame(Type = c(rep('Performing'), rep('Defaulted')), value = c(nrow(performing3 %>% filter(float == 1))/nrow(performing3), nrow(npl3 %>% filter(float == 1))/nrow(npl3), nrow(performing3 %>% filter(loanageless5 == 1))/nrow(performing3), nrow(npl3 %>% filter(loanageless5 == 1))/nrow(npl3)), var = c('float', 'float', 'loan_age_less5', 'loan_age_less5')) ggplot(plot10, aes(y = factor(var, levels = c('loan_age_less5', 'float')), x = value)) + geom_line() + geom_point(aes(color = Type), size = 3) + theme_bw() + theme(legend.title = element_blank(), legend.position = 'bottom') + scale_x_continuous(name = "Share of 'Yes'", limits = c(0,1), labels = label_percent()) + labs(y='') #binary borrower characteristics plot11 <- data.frame(Type = c(rep('Performing'), rep('Defaulted')), value = c(nrow(performing3 %>% filter(ageless36 == 1))/nrow(performing3), nrow(npl3 %>% filter(ageless36 == 1))/nrow(npl3), nrow(performing3 %>% filter(married == 1))/nrow(performing3), nrow(npl3 %>% filter(married == 1))/nrow(npl3), nrow(performing3 %>% filter(university == 1))/nrow(performing3), nrow(npl3 %>% filter(university == 1))/nrow(npl3), nrow(performing3 %>% filter(entrepreneur == 1))/nrow(performing3), nrow(npl3 %>% filter(entrepreneur == 1))/nrow(npl3), nrow(performing3 %>% filter(new_client == 1))/nrow(performing3), nrow(npl3 %>% filter(new_client == 1))/nrow(npl3)), var = c('cust_age_less36', 'cust_age_less36', 'married', 'married', 'university', 'university', 'entrepreneur', 'entrepreneur', 'new_client', 'new_client')) ggplot(plot11, aes(y = factor(var, levels = c('new_client', 'entrepreneur', 'university', 'married', 'cust_age_less36')), x = value)) + geom_line() + geom_point(aes(color = Type), size = 3) + theme_bw() + theme(legend.title = element_blank(), legend.position = 'bottom') + scale_x_continuous(name = "Share of 'Yes'", limits = c(0,1), labels = label_percent()) + labs(y='') #income plot12 <- data.frame(Type = c(rep('Performing',nrow(performing3)), rep('Defaulted',nrow(npl3))), value = c(performing3$income ,npl3$income)) mu12 <- ddply(plot12, "Type", summarise, grp.mean=mean(value)) p12 <- ggplot(plot12, aes(x=value, fill = Type)) + geom_histogram(aes(y=..density..), alpha = 0.3, color = 'grey90', position = 'identity', bins = 27, size = 0.1) + geom_vline(data=mu12, aes(xintercept=grp.mean, color=Type), linetype="dashed", size = 0.7, alpha = 0.7) + #facet_grid(type~., scales = 'free') + theme_bw() + theme(legend.title = element_blank(), legend.position = 'bottom') + scale_x_continuous(name = 'Income') + scale_y_continuous(name = 'Density') plot12a <- data.frame(Type = c(rep('Performing',nrow(performing3)), rep('Defaulted',nrow(npl3))), value = c(performing3$income_real ,npl3$income_real)) mu12a <- ddply(plot12a, "Type", summarise, grp.mean=mean(value)) p12a <- ggplot(plot12a, aes(x=value, fill = Type)) + geom_histogram(aes(y=..density..), alpha = 0.3, color = 'grey90', position = 'identity', bins = 27, size = 0.1) + geom_vline(data=mu12a, aes(xintercept=grp.mean, color=Type), linetype="dashed", size = 0.7, alpha = 0.7) + #facet_grid(type~., scales = 'free') + theme_bw() + theme(legend.title = element_blank(), legend.position = 'bottom') + scale_x_continuous(name = 'Real income') + scale_y_continuous(name = element_blank()) ggarrange(p12,p12a, ncol = 2, nrow = 1, common.legend = T, legend = 'bottom') #binary property characteristics plot13 <- data.frame(Type = c(rep('Performing'), rep('Defaulted')), value = c(nrow(performing3 %>% filter(bohemia == 1))/nrow(performing3), nrow(npl3 %>% filter(bohemia == 1))/nrow(npl3), nrow(performing3 %>% filter(rental == 1))/nrow(performing3), nrow(npl3 %>% filter(rental == 1))/nrow(npl3), nrow(performing3 %>% filter(flat == 1))/nrow(performing3), nrow(npl3 %>% filter(flat == 1))/nrow(npl3), nrow(performing3 %>% filter(built1980 == 1))/nrow(performing3), nrow(npl3 %>% filter(built1980 == 1))/nrow(npl3), nrow(performing3 %>% filter(built2015 == 1))/nrow(performing3), nrow(npl3 %>% filter(built2015 == 1))/nrow(npl3)), var = c('bohemia', 'bohemia', 'rental', 'rental', 'apartment', 'apartment', 'built_bef1980', 'built_bef1980', 'built_aft2014', 'built_aft2014')) ggplot(plot13, aes(y = factor(var, levels = c('built_aft2014', 'built_bef1980', 'apartment', 'rental', 'bohemia')), x = value)) + geom_line() + geom_point(aes(color = Type), size = 3) + theme_bw() + theme(legend.title = element_blank(), legend.position = 'bottom') + scale_x_continuous(name = "Share of 'Yes'", limits = c(0,1), labels = label_percent()) + labs(y='') #building last rebuilt plot14 <- data.frame(type = c(rep('Performing',nrow(performing3)), rep('Defaulted',nrow(npl3))), value = c(performing3$YR_last_renew ,npl3$YR_last_renew)) mu14 <- ddply(plot14, "type", summarise, grp.mean=mean(value)) ggplot(plot14, aes(x=value, fill = type)) + geom_histogram(aes(y=..density..), alpha = 0.3, color = 'grey90', position = 'identity', bins = 21, size = 0.1) + geom_vline(data=mu14, aes(xintercept=grp.mean, color=type), linetype="dashed", size = 0.7, alpha = 0.7) + theme_bw() + theme(legend.title = element_blank(), legend.position = "bottom") + scale_x_continuous(name = element_blank(), limits = c(1825,2025)) + scale_y_continuous(name = 'Density') #+ #facet_grid(type~., scales = 'free') ### MODELS ------------------------------------------------------------------------------------------------- # AB EPC classes ----------------------------------------------------------------------------------- logit_solo_ab <- glm(NPL ~ EPC_AB, family = binomial (link = 'logit'), data = data3) summary(logit_solo_ab) logit_ab_noyr <- glm( NPL ~ EPC_AB #+ EPC_EFG #+ built2015 #+ built1980 + LTV_cont + DSTI #+ DTI_init + IR + float + matur_doba_splaceni + loanageless5 + log(income) + ageless36 + married + university + entrepreneur + new_client + bohemia + rental + flat , family = binomial (link = 'logit'), data = data3 ) summary(logit_ab_noyr) logit_ab <- glm( NPL ~ EPC_AB #+ EPC_EFG + built2015 + built1980 + LTV_cont + DSTI #+ DTI_init + IR + float + matur_doba_splaceni + loanageless5 + log(income) + ageless36 + married + university + entrepreneur + new_client + bohemia + rental + flat , family = binomial (link = 'logit'), data = data3 ) summary(logit_ab) summ(logit_ab) #data3$EPC_AB <- as.factor(data3$EPC_AB) #effect_plot(logit_ab, pred =EPC_AB, interval = TRUE, y.label = "Probability of default") # EFG EFG classes ----------------------------------------------------------------------------------- logit_solo_efg <- glm(NPL ~ EPC_EFG, family = binomial (link = 'logit'), data = data3) summary(logit_solo_efg) logit_efg_noyr <- glm( NPL ~ #EPC_AB + EPC_EFG #+ built2015 #+ built1980 + LTV_cont + DSTI #+ DTI_init + IR + float + matur_doba_splaceni + loanageless5 + log(income) + ageless36 + married + university + entrepreneur + new_client + bohemia + rental + flat , family = binomial (link = 'logit'), data = data3 ) summary(logit_efg_noyr) logit_efg <- glm( NPL ~ #EPC_AB + EPC_EFG + built2015 + built1980 + LTV_cont + DSTI #+ DTI_init + IR + float + matur_doba_splaceni + loanageless5 + log(income) + ageless36 + married + university + entrepreneur + new_client + bohemia + rental + flat , family = binomial (link = 'logit'), data = data3 ) summary(logit_efg) # complex ----------------------------------------------------------------------------------- logit_noyr <- glm( NPL ~ EPC_AB + EPC_EFG #+ built2015 #+ built1980 + LTV_cont + DSTI #+ DTI_init + IR + float + matur_doba_splaceni + loanageless5 + log(income) + ageless36 + married + university + entrepreneur + new_client + bohemia + rental + flat , family = binomial (link = 'logit'), data = data3 ) summary(logit_noyr) logit_justyr <- glm( NPL ~ #EPC_AB #+ EPC_EFG + built2015 + built1980 + LTV_cont + DSTI #+ DTI_init + IR + float + matur_doba_splaceni + loanageless5 + log(income) + ageless36 + married + university + entrepreneur + new_client + bohemia + rental + flat , family = binomial (link = 'logit'), data = data3 ) summary(logit_justyr) #all variables logit <- glm( NPL ~ EPC_AB + EPC_EFG + built2015 + built1980 + LTV_cont + DSTI #+ DTI_init + IR + float + matur_doba_splaceni + loanageless5 + log(income) + ageless36 + married + university + entrepreneur + new_client + bohemia + rental + flat , family = binomial (link = 'logit'), data = data3 ) summary(logit) pR2(logit_solo_efg) # EXPORT INTO LATEX --------------------------------------------------------------------- stargazer(logit, logit_noyr, logit_justyr, covariate.labels = c('EPC class A, B (Y/N)', 'EPC class E, F, G (Y/N)', '(Re)built after 2014 (Y/N)', '(Re)built before 1980 (Y/N)', 'Contemporaneous LTV', 'Initial DSTI', 'Interest rate', 'Adjustable rate (Y/N)', 'Loan maturity', 'Loan age less 5 yrs (Y/N)', 'Income (log)', 'Customer age less 36 yrs (Y/N)', 'Married (Y/N)', 'University (Y/N)', 'Entrepreneur (Y/N)', 'New client (Y/N)', 'Bohemia (Y/N)', 'Rental property (Y/N)', 'Apartment (Y/N)', 'Intercept' ), title = "Regression Results", align = TRUE, font.size = "scriptsize", header = FALSE, type = "latex", no.space = T, digit.separator = ' ') stargazer(logit_solo_ab, logit_ab_noyr, logit_ab, covariate.labels = c('EPC class A, B (Y/N)', #'EPC class E, F, G (Y/N)', '(Re)built after 2014 (Y/N)', '(Re)built before 1980 (Y/N)', 'Contemporaneous LTV', 'Initial DSTI', 'Interest rate', 'Adjustable rate (Y/N)', 'Loan maturity', 'Loan age less 5 yrs (Y/N)', 'Income (log)', 'Customer age less 36 yrs (Y/N)', 'Married (Y/N)', 'University (Y/N)', 'Entrepreneur (Y/N)', 'New client (Y/N)', 'Bohemia (Y/N)', 'Rental property (Y/N)', 'Apartment (Y/N)', 'Intercept' ), title = "Regression Results", align = TRUE, font.size = "scriptsize", header = FALSE, type = "latex", no.space = T, digit.separator = ' ') stargazer(logit_solo_efg, logit_efg_noyr, logit_efg, covariate.labels = c(#'EPC class A, B (Y/N)', 'EPC class E, F, G (Y/N)', '(Re)built after 2014 (Y/N)', '(Re)built before 1980 (Y/N)', 'Contemporaneous LTV', 'Initial DSTI', 'Interest rate', 'Adjustable rate (Y/N)', 'Loan maturity', 'Loan age less 5 yrs (Y/N)', 'Income (log)', 'Customer age less 36 yrs (Y/N)', 'Married (Y/N)', 'University (Y/N)', 'Entrepreneur (Y/N)', 'New client (Y/N)', 'Bohemia (Y/N)', 'Rental property (Y/N)', 'Apartment (Y/N)', 'Intercept' ), title = "Regression Results", align = TRUE, font.size = "scriptsize", header = FALSE, type = "latex", no.space = T, digit.separator = ' ') #MARGINAL EFFECTS ---------------------------------------------------------------------- pred_no <- data3[-c(24,28,29)] pred_no <- pred_no %>% ungroup %>% add_row(!!! colMeans(.)) pred_no[125642,24]=0 pred_no[125642,25]=0 pred_no<-pred_no[125642,] pred_yes <- data3[-c(24,28,29)] pred_yes <- pred_yes %>% ungroup %>% add_row(!!! colMeans(.)) pred_yes[125642,24]=1 pred_yes[125642,25]=0 pred_yes<-pred_yes[125642,] predict(logit, pred_yes, type='response') - predict(logit, pred_no, type='response') # OFFICIAL EPCs ------------------------------------------------------------------------------- data_EPC_real <- filter(data3, if_EPC_real == 1) npl_EPC_real <- filter(data_EPC_real, NPL == 1) logit_offic_ab_solo <- glm( NPL ~ EPC_AB, family = binomial (link = 'logit'), data = data_EPC_real ) summary(logit_offic_ab_solo) logit_offic_ab <- glm( NPL ~ EPC_AB + LTV_cont + DSTI , family = binomial (link = 'logit'), data = data_EPC_real ) summary(logit_offic_ab) logit_offic_efg_solo <- glm( NPL ~ EPC_EFG, family = binomial (link = 'logit'), data = data_EPC_real ) summary(logit_offic_efg_solo) logit_offic_efg <- glm( NPL ~ EPC_EFG + LTV_cont + DSTI, family = binomial (link = 'logit'), data = data_EPC_real ) summary(logit_offic_efg) stargazer(#logit_offic_ab_solo, logit_offic_ab, logit_offic_efg_solo, logit_offic_efg, covariate.labels = c(#'EPC class A, B (Y/N)', 'EPC class E, F, G (Y/N)', 'Contemporaneous LTV', 'Initial DSTI', 'Intercept' ), title = "Regression Results", align = TRUE, font.size = "scriptsize", header = FALSE, type = "text", no.space = T, digit.separator = ' ') pR2(logit_offic_efg) # ROBUSTNESS TESTS ------------------------------------------------------------ lowIR <- data3 %>% filter(IR < 3.5) logit_ab_IR <- glm( NPL ~ EPC_AB #+ EPC_EFG + built2015 + built1980 + LTV_cont + DSTI #+ DTI_init + IR + float + matur_doba_splaceni + loanageless5 + log(income) + ageless36 + married + university + entrepreneur + new_client + bohemia + rental + flat , family = binomial (link = 'logit'), data = lowIR ) summary(logit_ab_IR) logit_ab_LTV_init <- glm( NPL ~ EPC_AB #+ EPC_EFG + built2015 + built1980 + LTV_init + DSTI #+ DTI_init + IR + float + matur_doba_splaceni + loanageless5 + log(income) + ageless36 + married + university + entrepreneur + new_client + bohemia + rental + flat , family = binomial (link = 'logit'), data = data3 ) summary(logit_ab_LTV_init) logit_ab_realinc <- glm( NPL ~ EPC_AB #+ EPC_EFG + built2015 + built1980 + LTV_cont + DSTI #+ DTI_init + IR + float + matur_doba_splaceni + loanageless5 + log(income_real) + ageless36 + married + university + entrepreneur + new_client + bohemia + rental + flat , family = binomial (link = 'logit'), data = data3 ) summary(logit_ab_realinc) stargazer(logit_ab, logit_ab_IR, logit_ab_LTV_init, logit_ab_realinc, covariate.labels = c('EPC class A, B (Y/N)', '(Re)built after 2014 (Y/N)', '(Re)built before 1980 (Y/N)', 'Contemporaneous LTV', 'Initial LTV', 'Initial DSTI', 'Interest rate', 'Adjustable rate (Y/N)', 'Loan maturity', 'Loan age less 5 yrs (Y/N)', 'Income (log)', 'Real income (log)', 'Customer age less 36 yrs (Y/N)', 'Married (Y/N)', 'University (Y/N)', 'Entrepreneur (Y/N)', 'New client (Y/N)', 'Bohemia (Y/N)', 'Rental property (Y/N)', 'Apartment (Y/N)', 'Intercept' ), title = "Regression Results", align = TRUE, font.size = "scriptsize", header = FALSE, type = "latex", no.space = T, digit.separator = ' ') pR2(logit_ab_realinc) # CORRELATION MATRIX ----------------------------------------------------------- used_cols <- data3 %>% dplyr::select(NPL , LTV_cont , DSTI , IR , float , matur_doba_splaceni , loanageless5 , income , ageless36 , married , university #, unemployed , entrepreneur , new_client , bohemia , rental , flat , built1980 , built2015 , EPC_AB , EPC_EFG) summary(used_cols) corr <- cor(used_cols) p.mat <- cor_pmat(used_cols) ggcorrplot(corr, ggtheme = ggplot2::theme_bw, type = "lower", lab = TRUE,colors = c("#6D9EC1", "white", "#E46726"),digits = 1) #check for phi coefficient ('correlation for binary vars') nrow(data3 %>% dplyr::filter(EPC_AB != built2015 & EPC_AB == 0)) matr <- matrix(c(79881, 10419, 2182, 33159), nrow=2) phi(matr, digits = 5) # VIF ---------------------------------------------------------------- car::vif(logit) blr_coll_diag(logit) ### PLOTTING LINEARITY OF EXPLANATORY VARS AND LOG-ODDS ------------------------------------------------------------------------------- logitaux <- glm(NPL ~ LTV_cont + DSTI + IR + log(income) + matur_doba_splaceni, data = data3, family = 'binomial' (link = 'logit')) summary(logitaux) prob <- predict(logitaux, type = 'response') log_odds <- log(prob/(1-prob)) p15 <- ggplot(data3, aes(LTV_cont, log_odds)) + geom_point(alpha = 0.05) + theme_bw() + scale_y_continuous(name = 'Log-odds') p16 <- ggplot(data3, aes(DSTI, log_odds)) + geom_point(alpha = 0.05) + theme_bw() + scale_y_continuous(name = 'Log-odds') p17 <- ggplot(data3, aes(IR, log_odds)) + geom_point(alpha = 0.05) + theme_bw() + scale_y_continuous(name = 'Log-odds') p18 <- ggplot(data3, aes(log(income), log_odds)) + geom_point(alpha = 0.05) + theme_bw() + scale_y_continuous(name = 'Log-odds') + scale_x_continuous(limits = c(9,12)) p19 <- ggplot(data3, aes(matur_doba_splaceni, log_odds)) + geom_point(alpha = 0.05) + theme_bw() + scale_y_continuous(name = 'Log-odds') #computationally intensive #ggarrange(p15,p16,p17,p18,p19, ncol = 2, nrow = 3)