python機器學習 IV值 WOE值的計算

2021-09-25 23:58:39 字數 4299 閱讀 5955

評分卡模型中的iv和woe詳解

看完後,一定要注意

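iv衡量的是某一個變數的資訊量。從公式來看,它相當於自變數各分組woe值的加權求和,其值的大小決定了自變數對於目標變數的影響程度。對於分組 i,其對應的iv值參考下圖(即 $IV_i = (p_{y_i} - p_{n_i}) \cdot WOE_i$,其中 $WOE_i = \ln(p_{y_i} / p_{n_i})$,$p_{y_i}$、$p_{n_i}$ 分別是該分組的響應數、非響應數佔全體響應數、全體非響應數的比例;整個變數的 $IV = \sum_{i=1}^{n} IV_i$),其中n是分組個數。注意:在變數的任何分組中,不應該出現響應數為0或非響應數為0的情況。當變數的一個分組的響應數為0時,對應的woe就為負無窮,此時iv值為正無窮。如果可能,直接把這個分組做成一個規則,作為模型的前置條件或補充條件。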

推薦另一篇文章

iv值的計算及使用

#呼叫包

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt #用不到

# Read the two sheets of the source workbook.
io = r'e:\工作\專項\白騎士資料驗證\白騎士資料彙總表.xlsx'
yinka = pd.read_excel(io, sheet_name='yinka')
bqs = pd.read_excel(io, sheet_name='bqs_result')

# Inner join on 'no': only rows whose key appears in both sheets are kept.
yinka_bqs = pd.merge(yinka, bqs, left_on='no', right_on='no', how='inner')

#方法一 求woe值

'''def calciv(xvar, yvar):

n_0 = np.sum(yvar == 0)

n_1 = np.sum(yvar == 1)

n_0_group = np.zeros(np.unique(xvar).shape)

n_1_group = np.zeros(np.unique(xvar).shape)

for i in range(len(np.unique(xvar))):

n_0_group[i] = yvar[(xvar == np.unique(xvar)[i]) & (yvar == 0)].count()

n_1_group[i] = yvar[(xvar == np.unique(xvar)[i]) & (yvar == 1)].count()

iv = np.sum((n_0_group/n_0 - n_1_group/n_1) * np.log((n_0_group/n_0)/(n_1_group/n_1)))

return iv

#求iv值  

def caliv_batch(df, kvar, yvar):

df_xvar = df.drop([kvar, yvar], axis=1)

ivlist = []

for col in df_xvar.columns:

iv = calciv(df[col], df[yvar])

ivlist.append(iv)

names = list(df_xvar.columns)

iv_df = pd.DataFrame({'var': names, 'iv': ivlist}, columns=['var', 'iv'])

return iv_df'''

#方法二 求woe值

#求iv值

def calcwoe(dataset, col, targe):
    """Build the per-group WOE / IV table of one variable against a target.

    Rows are grouped by the distinct values of ``col`` and ``targe`` is summed
    inside every group, so ``targe`` is assumed to be a 0/1 flag in which 1
    marks a "bad" (responding) row.

    Args:
        dataset: DataFrame that contains both columns.
        col: Name of the grouping variable.
        targe: Name of the target column.

    Returns:
        DataFrame indexed by the values of ``col`` with the columns
        ``col`` (rows in the group), ``targe`` (sum of the target),
        ``bad``, ``good``,
        ``woe`` = ln((bad / bad_total) / (good / good_total)) and
        ``iv``  = (bad / bad_total - good / good_total) * woe.
        A group that has no bad rows or no good rows gets ``woe`` = ``iv`` = 0
        instead of +/-inf, so the table can still be summed.
    """
    grouped = dataset.groupby(col)
    data = pd.DataFrame({
        col: grouped[col].count(),    # rows per group
        targe: grouped[targe].sum(),  # bad rows per group
    })

    b_total = data[targe].sum()
    total = data[col].sum()
    g_total = total - b_total

    data["bad"] = data[targe]
    data["good"] = data[col] - data[targe]

    bad_rate = data["bad"] / b_total
    good_rate = data["good"] / g_total

    # The logarithm is only defined when both counts are non-zero; every other
    # group is neutralised (ratio 1 -> woe 0, iv 0).
    valid = (data["bad"] > 0) & (data["good"] > 0)
    data["woe"] = np.log((bad_rate / good_rate).where(valid, 1.0))
    data["iv"] = ((bad_rate - good_rate) * data["woe"]).where(valid, 0.0)
    return data

# .loc[:,["bad","good","woe"]]

# calcwoe(yinka_bqs,"var1","flag_7")

# Disabled alternative of calcwoe (bins the variable with pd.cut into 3
# intervals). It is a bare string literal, so nothing inside it executes.
'''#驗證iv值

def calcwoe(dataset,col,targe):

#dataset,col,targe=yinka_bqs,'var1','flag_7'

a=dataset[[col,targe]].dropna()

subdata=a.groupby(pd.cut(a[col].dropna(),3))[col].count()

suby=a.groupby(pd.cut(a[col].dropna(),3))[targe].sum()

data=pd.merge(subdata,suby,how="left",left_index=true,right_index=true)

b_total=data[targe].sum()

total=data[col].sum()

g_total=total-b_total

return data

'''


def calciv(dataset):
    """Return the total IV of a variable.

    Args:
        dataset: Any mapping-like table (e.g. a DataFrame) whose ``"iv"``
            entry is an iterable of per-group IV contributions.

    Returns:
        The sum of all values found under ``dataset["iv"]``.
    """
    iv = sum(dataset["iv"])
    return iv

if __name__ == '__main__':
    # WOE / IV table of var3 against the flag_7 target.
    a = calcwoe(yinka_bqs, "var3", "flag_7")

    # Groups without any good rows: force their WOE to 0. A single .loc
    # assignment is used because chained assignment (a.woe[mask] = 0) may
    # write to a temporary copy and leave `a` unchanged.
    a.loc[a["good"] == 0, "woe"] = 0

    data_iv = calciv(a)
    print(data_iv)

# The two string literals below hold disabled code; they are bare string
# expressions, so nothing inside them is executed.
#   1. Export: opens a writer on woe.xlsx, writes the calcwoe() table of
#      var1 .. var12 (target flag_7) to one sheet per variable, then closes
#      the writer.
#   2. A variant of calcwoe() that drops NaN rows and groups the variable by
#      pd.cut(..., 3) bins instead of by its raw values.
# NOTE(review): the text uses lower-cased names (`pd.excelwriter`, `true`);
# presumably they must be restored to `pd.ExcelWriter` / `True`, and the
# indentation re-added, before this code can be re-enabled.
'''io1=r'e:\工作\專項\白騎士資料驗證\woe.xlsx'

writer = pd.excelwriter(io1)

calcwoe(yinka_bqs,"var1","flag_7").to_excel(writer,sheet_name='var1')

calcwoe(yinka_bqs,"var2","flag_7").to_excel(writer,sheet_name='var2')

calcwoe(yinka_bqs,"var3","flag_7").to_excel(writer,sheet_name='var3')

calcwoe(yinka_bqs,"var4","flag_7").to_excel(writer,sheet_name='var4')

calcwoe(yinka_bqs,"var5","flag_7").to_excel(writer,sheet_name='var5')

calcwoe(yinka_bqs,"var6","flag_7").to_excel(writer,sheet_name='var6')

calcwoe(yinka_bqs,"var7","flag_7").to_excel(writer,sheet_name='var7')

calcwoe(yinka_bqs,"var8","flag_7").to_excel(writer,sheet_name='var8')

calcwoe(yinka_bqs,"var9","flag_7").to_excel(writer,sheet_name='var9')

calcwoe(yinka_bqs,"var10","flag_7").to_excel(writer,sheet_name='var10')

calcwoe(yinka_bqs,"var11","flag_7").to_excel(writer,sheet_name='var11')

calcwoe(yinka_bqs,"var12","flag_7").to_excel(writer,sheet_name='var12')

writer.close()

''''''

#驗證iv值

def calcwoe(dataset,col,targe):

#dataset,col,targe=yinka_bqs,'var1','flag_7'

a=dataset[[col,targe]].dropna()

subdata=a.groupby(pd.cut(a[col].dropna(),3))[col].count()

suby=a.groupby(pd.cut(a[col].dropna(),3))[targe].sum()

data=pd.merge(subdata,suby,how="left",left_index=true,right_index=true)

b_total=data[targe].sum()

total=data[col].sum()

g_total=total-b_total

return data

'''

python計算iv值 python計算IV值

1.基於jupyter notebook 導包import numpy as np import math import pandas as pd from sklearn.utils.multiclass import type of target from scipy import stats ...

python計算iv值 python計算IV值

1.基於jupyter notebook 導包import numpy as np import math import pandas as pd from sklearn.utils.multiclass import type of target from scipy import stats ...

Python 批量計算變數iv值

import pandas as pd import numpy as np from sklearn.tree import decisiontreeclassifier data pd.read excel r e lll 20200311人工客群分布 sx all data 0311.xlsx...