2019独角兽企业重金招聘Python工程师标准>>>
1、数据源:
我们将会使用在信用评级建模中非常常用的德国信贷数据(German credit dataset)作为建模的数据集。德国信贷数据共有1000条数据,每条数据20个特征。
2、数据源下载:
https://github.com/frankhlchi/R-scorecard
3、建模过程
4、完整版(源代码):
rm(list=ls())
gc()
library(caret)
library(smbinning)
library(ggplot2)
library(woe)
#load the data
german_credit <- read.csv("C:/pic/credit/german_credit.csv",sep = ",")
train <-createDataPartition(y=german_credit$Creditability,p=0.75,list=FALSE)
train2 <- german_credit[train, ]
test2 <- german_credit[-train, ]
#Explore data distribution
ggplot(german_credit, aes(x = Duration,y = ..count..,)) + geom_histogram(fill = "blue", colour = "grey60", size = 0.2, alpha = 0.2,binwidth = 5)
ggplot(german_credit, aes(x = CreditAmount,y = ..count..,)) + geom_histogram(fill = "blue", colour = "grey60", size = 0.2, alpha = 0.2,binwidth = 1000)
ggplot(german_credit, aes(x = Age,y = ..count..,)) + geom_histogram(fill = "blue", colour = "grey60", size = 0.2, alpha = 0.2,binwidth = 5)
ggplot(german_credit, aes(x =Creditability,y = ..count..,)) + geom_histogram(fill = "blue", colour = "grey60" , alpha = 0.2,binwidth = 0.5)
#Optimal Binning
Durationresult=smbinning(df=train2,y="Creditability",x="Duration",p=0.05)
CreditAmountresult=smbinning(df=train2,y="Creditability",x="CreditAmount",p=0.05)
Ageresult=smbinning(df=train2,y="Creditability",x="Age",p=0.05)
smbinning.plot(CreditAmountresult,option="WoE",sub="CreditAmount")
smbinning.plot(Durationresult,option="WoE",sub="Duration")
smbinning.plot(Ageresult,option="WoE",sub="Age")
#Univariate Analysis
AccountBalancewoe=woe(train2, "AccountBalance",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
ggplot(AccountBalancewoe, aes(x = BIN, y = -WOE)) + geom_bar(stat = "identity",fill = "blue", colour = "grey60",size = 0.2, alpha = 0.2)+labs(title = "AccountBalance")
ValueSavingswoe=woe(train2, "ValueSavings",Continuous = F, "Creditability",C_Bin = 5,Good = "1",Bad = "0")
ggplot(ValueSavingswoe, aes(x = BIN, y = -WOE)) + geom_bar(stat = "identity",fill = "blue", colour = "grey60",size = 0.2, alpha = 0.2)+labs(title = "ValueSavings")
Lengthofcurrentemploymentwoe=woe(train2, "Lengthofcurrentemployment",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
ggplot(Lengthofcurrentemploymentwoe, aes(x = BIN, y = -WOE)) + geom_bar(stat = "identity",fill = "blue", colour = "grey60",size = 0.2, alpha = 0.2)+labs(title = "Lengthofcurrentemployment")
#combine some bins
for(i in 1:750){
if(train2$ValueSavings[i]==1){train2$ValueSavings[i]=2}
}
for(i in 1:750){
if(train2$Lengthofcurrentemployment[i]==5){train2$Lengthofcurrentemployment[i]=4}
}
library(corrplot)
#correlation analysis
cor1<-cor(train)
corrplot(cor1,tl.cex = 0.5)
#Infomation Value calculation
for(i in 1:1000){
if(german_credit$Duration[i]<=11){german_credit$Duration[i]=1}
else if(german_credit$Duration[i]<=30){german_credit$Duration[i]=2}
else{german_credit$Duration[i]=3}
}
for(i in 1:1000){
if(german_credit$Age[i]<=25){german_credit$Age[i]=1}
else{german_credit$Age[i]=2}
}
for(i in 1:1000){
if(german_credit$CreditAmount[i]<=6742){german_credit$CreditAmount[i]=1}
else{german_credit$CreditAmount[i]=2}
}
for(i in 1:1000){
if(german_credit$ValueSavings[i]==1){german_credit$ValueSavings[i]=2}
}
for(i in 1:1000){
if(german_credit$Lengthofcurrentemployment[i]==5){german_credit$Lengthofcurrentemployment[i]=4}
}
AccountBalancewoe=woe(train2, "AccountBalance",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Durationwoe=woe(train2, "Duration",Continuous = F, "Creditability",C_Bin = 2,Good = "1",Bad = "0")
PaymentStatusofPreviousCreditwoe=woe(train2, "PaymentStatusofPreviousCredit",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Purposewoe = woe(train2, "Purpose",Continuous = F, "Creditability",C_Bin = 11,Good = "1",Bad = "0")
CreditAmountwoe= woe(train2, "CreditAmount",Continuous = F, "Creditability",C_Bin = 2,Good = "1",Bad = "0")
ValueSavingswoe =woe(train2, "ValueSavings",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Lengthofcurrentemploymentwoe=woe(train2, "Lengthofcurrentemployment",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Instalmentpercenwoet=woe(train2, "Instalmentpercent",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Sex.Marital.Statuswoe=woe(train2, "Sex.Marital.Status",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Guarantorswoe=woe(train2, "Guarantors",Continuous = F, "Creditability",C_Bin = 2,Good = "1",Bad = "0")
DurationinCurrentaddresswoe=woe(train2, "DurationinCurrentaddress",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Mostvaluableavailableassetwoe=woe(train2, "Mostvaluableavailableasset",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Agewoe=woe(train2, "Age",Continuous = F, "Creditability",C_Bin = 2,Good = "1",Bad = "0")
ConcurrentCreditswoe=woe(train2, "ConcurrentCredits",Continuous = F, "Creditability",C_Bin = 3,Good = "1",Bad = "0")
Typeofapartmentwoe=woe(train2, "Typeofapartment",Continuous = F, "Creditability",C_Bin = 3,Good = "1",Bad = "0")
NoofCreditatthisBankwoe=woe(train2, "NoofCreditatthisBank",Continuous = F, "Creditability",C_Bin = 3,Good = "1",Bad = "0")
Occupationwoe=woe(train2, "Occupation",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Noofdependentswoe=woe(train2, "Noofdependents",Continuous = F, "Creditability",C_Bin = 2,Good = "1",Bad = "0")
Telephonewoe=woe(train2, "Telephone",Continuous = F, "Creditability",C_Bin = 2,Good = "1",Bad = "0")
ForeignWorkerwoe=woe(train2, "ForeignWorker",Continuous = F, "Creditability",C_Bin = 2,Good = "1",Bad = "0")
va = c("AccountBalance", "Duration", "PaymentStatusofPreviousCredit", "Purpose", "CreditAmount", "ValueSavings", "Lengthofcurrentemployment", "Instalmentpercent","Sex.Marital.Status","Guarantors","DurationinCurrentaddress", "Mostvaluableavailableasset", "Age","ConcurrentCredits","Typeofapartment", "NoofCreditatthisBank","Occupation","Noofdependents", "Telephone" ,"ForeignWorker")
iv=c(sum(AccountBalancewoe$IV),sum(Durationwoe$IV),sum(PaymentStatusofPreviousCreditwoe$IV),sum(Purposewoe$IV),sum(CreditAmountwoe$IV),sum(ValueSavingswoe$IV),sum(Lengthofcurrentemploymentwoe$IV) ,sum(Instalmentpercenwoet$IV) ,sum(Sex.Marital.Statuswoe$IV) ,sum(Guarantorswoe$IV) ,sum(DurationinCurrentaddresswoe$IV) ,sum(Mostvaluableavailableassetwoe$IV),sum(Agewoe$IV),sum(ConcurrentCreditswoe$IV),sum(Typeofapartmentwoe$IV),sum(NoofCreditatthisBankwoe$IV),sum(Occupationwoe$IV), sum(Noofdependentswoe$IV), sum(Telephonewoe$IV),sum(ForeignWorkerwoe$IV))
infovalue = data.frame(va,iv)
ggplot(infovalue, aes(x = va, y = iv)) + geom_bar(stat = "identity",fill = "blue", colour = "grey60",size = 0.2, alpha = 0.2)+labs(title = "Information value")+ theme(axis.text.x=element_text(angle=90,colour="black",size=10));
#WoE transformation
german_credit$DurationinCurrentaddress=NULL
german_credit$Guarantors=NULL
german_credit$Instalmentpercent=NULL
german_credit$NoofCreditatthisBank=NULL
german_credit$Occupation=NULL
german_credit$Noofdependents=NULL
german_credit$Telephone=NULL
AccountBalancewoe=woe(train2, "AccountBalance",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Durationwoe=woe(train2, "Duration",Continuous = F, "Creditability",C_Bin = 2,Good = "1",Bad = "0")
PaymentStatusofPreviousCreditwoe=woe(train2, "PaymentStatusofPreviousCredit",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Purposewoe = woe(train2, "Purpose",Continuous = F, "Creditability",C_Bin = 11,Good = "1",Bad = "0")
CreditAmountwoe= woe(train2, "CreditAmount",Continuous = F, "Creditability",C_Bin = 2,Good = "1",Bad = "0")
ValueSavingswoe =woe(train2, "ValueSavings",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Lengthofcurrentemploymentwoe=woe(train2, "Lengthofcurrentemployment",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Sex.Marital.Statuswoe=woe(train2, "Sex.Marital.Status",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Mostvaluableavailableassetwoe=woe(train2, "Mostvaluableavailableasset",Continuous = F, "Creditability",C_Bin = 4,Good = "1",Bad = "0")
Agewoe=woe(train2, "Age",Continuous = F, "Creditability",C_Bin = 2,Good = "1",Bad = "0")
ConcurrentCreditswoe=woe(train2, "ConcurrentCredits",Continuous = F, "Creditability",C_Bin = 3,Good = "1",Bad = "0")
Typeofapartmentwoe=woe(train2, "Typeofapartment",Continuous = F, "Creditability",C_Bin = 3,Good = "1",Bad = "0")
ForeignWorkerwoe=woe(train2, "ForeignWorker",Continuous = F, "Creditability",C_Bin = 2,Good = "1",Bad = "0")
for(i in 1:1000){
for(s in 1:4){
if(german_credit$AccountBalance[i]==s){
german_credit$AccountBalance[i]=-AccountBalancewoe$WOE[s]
}
}
for(s in 1:3){
if(german_credit$Duration[i]==s){
german_credit$Duration[i]=-Durationwoe$WOE[s]
}
}
for(s in 0:4){
if(german_credit$PaymentStatusofPreviousCredit[i]==s){
german_credit$PaymentStatusofPreviousCredit[i]=-PaymentStatusofPreviousCreditwoe$WOE[s+1]
}
}
for(s in 0:10){
if(s<=6){
if(german_credit$Purpose[i]==s){
german_credit$Purpose[i]=-Purposewoe$WOE[s+1]
}
}else{
if(german_credit$Purpose[i]==s){
german_credit$Purpose[i]=-Purposewoe$WOE[s]
}
}
}
for(s in 1:2){
if(german_credit$CreditAmount[i]==s){
german_credit$CreditAmount[i]=-CreditAmountwoe$WOE[s]
}
}
for(s in 2:5){
if(german_credit$ValueSavings[i]==s){
german_credit$ValueSavings[i]=-ValueSavingswoe$WOE[s-1]
}
}
for(s in 1:5){
if(german_credit$Lengthofcurrentemployment[i]==s){
german_credit$Lengthofcurrentemployment[i]=-Lengthofcurrentemploymentwoe$WOE[s]
}
}
for(s in 1:5){
if(german_credit$Sex.Marital.Status[i]==s){
german_credit$Sex.Marital.Status[i]=-Sex.Marital.Statuswoe$WOE[s]
}
}
for(s in 1:4){
if(german_credit$Mostvaluableavailableasset[i]==s){
german_credit$Mostvaluableavailableasset[i]=-Mostvaluableavailableassetwoe$WOE[s]
}
}
for(s in 1:2){
if(german_credit$Age[i]==s){
german_credit$Age[i]=-Agewoe$WOE[s]
}
}
for(s in 1:5){
if(german_credit$ConcurrentCredits[i]==s){
german_credit$ConcurrentCredits[i]=-ConcurrentCreditswoe$WOE[s]
}
}
for(s in 1:5){
if(german_credit$Typeofapartment[i]==s){
german_credit$Typeofapartment[i]=-Typeofapartmentwoe$WOE[s]
}
}
for(s in 1:2){
if(german_credit$ForeignWorker[i]==s){
german_credit$ForeignWorker[i]=-ForeignWorkerwoe$WOE[s]
}
}
}
#Stepwise regression & Logistic model buiding
fit<-glm(Creditability~ AccountBalance + Duration +PaymentStatusofPreviousCredit +Purpose + CreditAmount + ValueSavings + Lengthofcurrentemployment +Sex.Marital.Status+ Mostvaluableavailableasset + Age + ConcurrentCredits + Typeofapartment + ForeignWorker,train2,family = "binomial")
backwards = step(fit)
summary(backwards)
fit2<-glm(Creditability~ AccountBalance + Duration +PaymentStatusofPreviousCredit +Purpose + CreditAmount + ValueSavings + Lengthofcurrentemployment + Age + ConcurrentCredits + ForeignWorker,train2,family = "binomial")
summary(fit2)
fit3<-glm(Creditability~ AccountBalance + Duration +PaymentStatusofPreviousCredit +Purpose + CreditAmount + ValueSavings + Lengthofcurrentemployment + Age + ForeignWorker,train2,family = "binomial")
summary(fit3)
#VIF testing
library(car)
vif(fit3, digits =3)
#producing confusion matrix
prediction <- predict(fit3,newdata=test2)
for (i in 1:250) {
if(prediction[i]>0.99){
prediction[i]=1}
else
{prediction[i]=0}
}
confusionMatrix(prediction, test2$Creditability)
coe = (fit3$coefficients)
p <- 20/log(2)
q <- 600-20*log(2.5)/log(2)
base <- q + p*as.numeric(coe[1])
#score for AccountBalance
AccountBalanceSCORE = p*as.numeric(coe[2])*AccountBalancewoe$woe[1]*-1
p*as.numeric(coe[2])*AccountBalancewoe$WOE[1]*-1
p*as.numeric(coe[2])*AccountBalancewoe$WOE[2]*-1
p*as.numeric(coe[2])*AccountBalancewoe$WOE[3]*-1
p*as.numeric(coe[2])*AccountBalancewoe$WOE[4]*-1
#score for Duration
p*as.numeric(coe[3])*Durationwoe$WOE[1]*-1
p*as.numeric(coe[3])*Durationwoe$WOE[2]*-1
p*as.numeric(coe[3])*Durationwoe$WOE[3]*-1
#score for PaymentStatusofPreviousCredit
p*as.numeric(coe[4])*PaymentStatusofPreviousCreditwoe$WOE[1]*-1
p*as.numeric(coe[4])*PaymentStatusofPreviousCreditwoe$WOE[2]*-1
p*as.numeric(coe[4])*PaymentStatusofPreviousCreditwoe$WOE[3]*-1
p*as.numeric(coe[4])*PaymentStatusofPreviousCreditwoe$WOE[4]*-1
p*as.numeric(coe[4])*PaymentStatusofPreviousCreditwoe$WOE[5]*-1
#score for purpose
for(i in 1:10){
print(p*as.numeric(coe[5])*Purposewoe$WOE[i])*-1
}
p*as.numeric(coe[6])*CreditAmountwoe$WOE[1]*-1
p*as.numeric(coe[6])*CreditAmountwoe$WOE[2]*-1
p*as.numeric(coe[7])*ValueSavingswoe$WOE[1]*-1
p*as.numeric(coe[7])*ValueSavingswoe$WOE[2]*-1
p*as.numeric(coe[7])*ValueSavingswoe$WOE[3]*-1
p*as.numeric(coe[7])*ValueSavingswoe$WOE[4]*-1
p*as.numeric(coe[8])*Lengthofcurrentemploymentwoe$WOE[1]*-1
p*as.numeric(coe[8])*Lengthofcurrentemploymentwoe$WOE[2]*-1
p*as.numeric(coe[8])*Lengthofcurrentemploymentwoe$WOE[3]*-1
p*as.numeric(coe[8])*Lengthofcurrentemploymentwoe$WOE[4]*-1
p*as.numeric(coe[9])*Agewoe$WOE[1]*-1
p*as.numeric(coe[9])*Agewoe$WOE[2]*-1
p*as.numeric(coe[10])*ForeignWorkerwoe$WOE[1]*-1
p*as.numeric(coe[10])*ForeignWorkerwoe$WOE[2]*-1
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
- 145
- 146
- 147
- 148
- 149
- 150
- 151
- 152
- 153
- 154
- 155
- 156
- 157
- 158
- 159
- 160
- 161
- 162
- 163
- 164
- 165
- 166
- 167
- 168
- 169
- 170
- 171
- 172
- 173
- 174
- 175
- 176
- 177
- 178
- 179
- 180
- 181
- 182
- 183
- 184
- 185
- 186
- 187
- 188
- 189
- 190
- 191
- 192
- 193
- 194
- 195
- 196
- 197
- 198
- 199
- 200
- 201
- 202
- 203
- 204
- 205
- 206
- 207
- 208
- 209
- 210
- 211
- 212
- 213
- 214
- 215
- 216
- 217
- 218
- 219
- 220
- 221
- 222
- 223
- 224
- 225
- 226
- 227
- 228
- 229
- 230
- 231
- 232
- 233
- 234
- 235
- 236
- 237
- 238
- 239
- 240
- 241
- 242
- 243
- 244
- 245
- 246
- 247
- 248
- 249
- 250
- 251
- 252
- 253
- 254
- 255
- 256
- 257
- 258
- 259
- 260
- 261
- 262
- 263
- 264
- 265
- 266
- 267
- 268
- 269
- 270
- 271
- 272
- 273
- 274
- 275
- 276
- 277
- 278
- 279
- 280
- 281
- 282
- 283
建模方法总结篇:
1、变量分箱
在评分卡建模中,变量分箱(binning)是对连续变量离散化(discretization)的一种称呼。要将logistic模型转换为标准评分卡的形式,这一环节是必须完成的。信用评分卡开发中一般有常用的等距分段、等深分段、最优分段。
其中等距分段(Equval length intervals)是指分段的区间是一致的,比如年龄以十年作为一个分段;等深分段(Equal frequency intervals)是先确定分段数量,然后令每个分段中数据数量大致相等;最优分段(Optimal Binning)又叫监督离散化(supervised discretizaion),使用递归划分(Recursive Partitioning)将连续变量分为分段,背后是一种基于条件推断查找较佳分组的算法(Conditional Inference Tree)
2、单变量分析
在风险建模的过程中,变量选择可以具体细化为单变量变量筛选 (Univariate Variable Selection)和多变量变量筛选 (Multivariate Variable Selection)。多变量变量筛选一般会利用Stepwise算法在变量池中选取最优变量。 而单变量筛选,或者说单变量分析,是通过比较指标分箱和对应分箱的违约概率来确定指标是否符合经济意义。
3、相关性分析 & IV(信息值)筛选
我们会用经过清洗后的数据看一下变量间的相关性。注意,这里的相关性分析只是初步的检查,进一步检查模型的多重共线性还需要通过 VIF(variance inflation factor)也就是 方差膨胀因子进行检验。
4、StepWise多变量分析 & Logistic模型建立
在进行StepWise分析前,我们需要将筛选后的变量转换为WoE值并建立Logistic模型。
将经过WoE转换的数据放入Logistic模型中建模,并使用向后逐步回归方法
为防止多重共线性问题的出现,我们对模型进行VIF检验:
5、模型检验
到这里,我们的建模部分基本结束了。我们需要验证一下模型的预测能力如何。我们使用在建模开始阶段预留的250条数据进行检验:
模型的精度达到了0.72,模型表现一般。这同Logistic模型本身的局限性有关。传统的回归模型精度一般都会弱于决策树、SVM等机器挖掘算法。
6、打分卡转换
我们在上一部分,我们已经基本完成了建模相关的工作,并用混淆矩阵验证了模型的预测能力。接下来的步骤,就是将Logistic模型转换为标准打分卡的形式。
在建立标准评分卡之前,我们需要选取几个评分卡参数:基础分值、 PDO(比率翻倍的分值)和好坏比。 这里, 我们取600分为基础分值,PDO为20 (每高20分好坏比翻一倍),好坏比取2.5。;可得下式:
620 = q - p * log(2.5)
600 = q - p * log(2.5/2)
p = 20/log(2)
q =600-20*log(2.5)/log(2)
- 1
- 2
- 3
- 4