python d rs normal


1. Functions used in the project

# -*- coding: utf-8 -*-

# %%time
# from pyhive import presto

import pandas as pd
import numpy as np
import warnings
import os
import sys
import time
import datetime, calendar
from datetime import timedelta, date
from time import strftime, localtime

from pyhive import presto
import matplotlib.pyplot as plt
from tqdm import tqdm

# sklearn.externals.joblib and sklearn.externals.six were removed in scikit-learn 0.23;
# import joblib and io.StringIO directly instead.
import joblib
from joblib import Parallel, delayed
from io import StringIO

import scorecardpy as sc
import toad
import pydotplus
from IPython.display import Image
from sklearn import tree
from sklearn.tree import _tree
from pandas import DataFrame
from functools import reduce

%matplotlib inline

# Pivot-table helper: pass aggregation functions via aggfunc (count, min, max, sum, mean, len, ...);
# without aggfunc it reports the share of `values`.
# Pivots with the given index rows and columns, showing the distribution of `values`; empty cells filled with 0.
# pd.pivot_table(result_with_overdue, index=['RI_CFV32_lvl'], columns=['bins'], values=['mob4'], fill_value=0, aggfunc=['count'])

# Display settings: max rows, max columns, column width
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)  # bare 'max_colwidth' is deprecated in newer pandas
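To make the pivot pattern above concrete, here is a minimal sketch on made-up data (all names below are invented for illustration):

toy = pd.DataFrame({'lvl': ['A', 'A', 'B', 'B', 'B'],
                    'bins': ['low', 'high', 'low', 'low', 'high'],
                    'mob4': [0, 1, 0, 1, 1]})
# row counts per (lvl, bins) cell, empty cells filled with 0
pd.pivot_table(toy, index=['lvl'], columns=['bins'], values=['mob4'],
               fill_value=0, aggfunc=['count'])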

# Replace 'IP' and port with your Presto host and port
cursor = presto.connect('IP', port, 'jobschedule').cursor()

def read_sql(hql):
    cursor.execute(hql)
    try:
        result = cursor.fetchall()
        columns_names = [i[0] for i in cursor.description]
    except Exception as e:
        # the result variables are undefined if the fetch fails, so report the error itself
        print(e)
        raise
    data = pd.DataFrame(list(result), columns=columns_names)
    return data
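A hypothetical call (the schema, table, and column names are placeholders; any SQL your Presto cluster accepts works):

# df = read_sql('select num_id, score, overdue from some_schema.some_table limit 1000')
# df.head()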

####################### PlotKS ##########################
def PlotKS(preds, labels, n=20, asc=True):
    # preds is a score: asc=1 (higher score = lower risk)
    # preds is a probability of bad: asc=0
    pred = preds   # predictions
    bad = labels   # 1 = bad, 0 = good
    ksds = pd.DataFrame({'bad': bad, 'pred': pred})
    ksds['good'] = 1 - ksds.bad

    if asc == 1:
        ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, True])
    elif asc == 0:
        ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, True])
    ksds1.index = range(len(ksds1.pred))
    ksds1['cumsum_good1'] = 1.0 * ksds1.good.cumsum() / sum(ksds1.good)
    ksds1['cumsum_bad1'] = 1.0 * ksds1.bad.cumsum() / sum(ksds1.bad)

    if asc == 1:
        ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, False])
    elif asc == 0:
        ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, False])
    ksds2.index = range(len(ksds2.pred))
    ksds2['cumsum_good2'] = 1.0 * ksds2.good.cumsum() / sum(ksds2.good)
    ksds2['cumsum_bad2'] = 1.0 * ksds2.bad.cumsum() / sum(ksds2.bad)

    # average the two tie-breaking orders: ksds1, ksds2 -> mean
    ksds = ksds1[['cumsum_good1', 'cumsum_bad1']].copy()
    ksds['cumsum_good2'] = ksds2['cumsum_good2']
    ksds['cumsum_bad2'] = ksds2['cumsum_bad2']
    ksds['cumsum_good'] = (ksds['cumsum_good1'] + ksds['cumsum_good2']) / 2
    ksds['cumsum_bad'] = (ksds['cumsum_bad1'] + ksds['cumsum_bad2']) / 2

    # ks
    ksds['ks'] = ksds['cumsum_bad'] - ksds['cumsum_good']
    ksds['tile0'] = range(1, len(ksds.ks) + 1)
    ksds['tile'] = 1.0 * ksds['tile0'] / len(ksds['tile0'])

    qe = list(np.arange(0, 1, 1.0 / n))
    qe.append(1)
    qe = qe[1:]

    ks_index = pd.Series(ksds.index)
    ks_index = ks_index.quantile(q=qe)
    ks_index = np.ceil(ks_index).astype(int)
    ks_index = list(ks_index)

    ksds = ksds.loc[ks_index]
    ksds = ksds[['tile', 'cumsum_good', 'cumsum_bad', 'ks']]
    ksds0 = np.array([[0, 0, 0, 0]])
    ksds = np.concatenate([ksds0, ksds], axis=0)
    ksds = pd.DataFrame(ksds, columns=['tile', 'cumsum_good', 'cumsum_bad', 'ks'])

    ks_value = abs(ksds.ks).max()  # 2020-05-02: take the absolute value
    ks_pop = ksds.tile[ksds.ks.idxmax()]
    print('ks_value is ' + str(np.round(ks_value, 4)) + ' at pop = ' + str(np.round(ks_pop, 4)))

    # chart
    plt.plot(ksds.tile, ksds.cumsum_good, label='cum_good',
             color='blue', linestyle='-', linewidth=2)
    plt.plot(ksds.tile, ksds.cumsum_bad, label='cum_bad',
             color='red', linestyle='-', linewidth=2)
    plt.plot(ksds.tile, ksds.ks, label='ks',
             color='green', linestyle='-', linewidth=2)
    plt.axvline(ks_pop, color='gray', linestyle='--')
    plt.axhline(ks_value, color='green', linestyle='--')
    plt.axhline(ksds.loc[ksds.ks.idxmax(), 'cumsum_good'], color='blue', linestyle='--')
    plt.axhline(ksds.loc[ksds.ks.idxmax(), 'cumsum_bad'], color='red', linestyle='--')
    plt.title('KS=%s ' % np.round(ks_value, 4) +
              'at Pop=%s' % np.round(ks_pop, 4), fontsize=15)
    plt.legend()
    return ksds
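A quick smoke test on synthetic data, assuming higher score means lower risk (so asc=True); all names here are made up:

rng = np.random.RandomState(0)
labels = pd.Series(rng.binomial(1, 0.2, size=2000))               # 1 = bad
scores = pd.Series(rng.normal(600, 50, size=2000)) - 40 * labels  # bads score lower on average
ks_table = PlotKS(scores, labels, n=20, asc=True)
plt.show()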

# Overdue flag: 30+ days past due counts as bad
def fun(x):
    if x >= 30:
        return 1
    else:
        return 0

# Grey-user flag: past due, but under 30 days
def Grey(x):
    if 0 < x < 30:
        return 1
    else:
        return 0

# Year-month part of a date, e.g. '2020-01'
def getYearMonth(x):
    return str(x)[0:7]

# Share of positives within a group (for use with groupby/agg)
def ration(x):
    return x.sum() / x.count()
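Applied to a hypothetical days-past-due column (the column names are invented for illustration):

tmp = pd.DataFrame({'overdue_days': [0, 5, 45, 12, 60],
                    'fina_date': ['2020-01-03'] * 5})
tmp['bad_ind'] = tmp['overdue_days'].apply(fun)    # 30+ days past due -> 1
tmp['grey_ind'] = tmp['overdue_days'].apply(Grey)  # 1-29 days past due -> 1
tmp['ym'] = tmp['fina_date'].apply(getYearMonth)   # '2020-01'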

class rule_tree(object):
    def __init__(self, datasets, ex_lis, dep='bad_ind', min_samples=0.05,
                 min_samples_leaf=200, min_samples_split=50, max_depth=3):
        '''
        Rule variables currently must be numeric.
        datasets: modelling data, DataFrame
        ex_lis: features excluded from modelling (ids, time slices, ...), list
        dep: label column name
        min_samples: minimum bin share of the total sample, numeric
        max_depth: maximum tree depth, numeric
        min_samples_leaf: minimum samples in a leaf node, numeric
        min_samples_split: minimum samples in a parent node before splitting, numeric
        '''
        self.datasets = datasets
        self.ex_lis = ex_lis
        self.dep = dep
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split

    def tree_to_code(self, tree, feature_names):
        tree_ = tree.tree_
        feature_name = [feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
                        for i in tree_.feature]
        print("def tree({}):".format(", ".join(feature_names)))

        def recurse(node, depth):
            indent = "    " * depth
            if tree_.feature[node] != _tree.TREE_UNDEFINED:
                name = feature_name[node]
                threshold = tree_.threshold[node]
                print("{}if {} <= {}:".format(indent, name, threshold))
                recurse(tree_.children_left[node], depth + 1)
                print("{}if {} > {}:".format(indent, name, threshold))
                # print("{}else:".format(indent))
                recurse(tree_.children_right[node], depth + 1)
            else:
                print("{}return {}".format(indent, tree_.value[node]))

        recurse(0, 1)

    def get_lineage(self, tree, feature_names):
        left = tree.tree_.children_left
        right = tree.tree_.children_right
        threshold = tree.tree_.threshold
        features = [feature_names[i] for i in tree.tree_.feature]
        # get ids of the leaf nodes
        idx = np.argwhere(left == -1)[:, 0]

        def recurse(left, right, child, lineage=None):
            if lineage is None:
                lineage = [child]
            if child in left:
                parent = np.where(left == child)[0].item()
                split = 'l'
            else:
                parent = np.where(right == child)[0].item()
                split = 'r'
            lineage.append((parent, split, threshold[parent], features[parent]))
            if parent == 0:
                lineage.reverse()
                return lineage
            else:
                return recurse(left, right, parent, lineage)

        node_list = []
        for child in idx:
            for node in recurse(left, right, child):
                node_list.append(node)
        return node_list

    def get_interval_node(self, node_list):
        node_dict = {}
        node_id = 1
        for id, i in enumerate(node_list[:-1]):
            if id == 0:
                node_dict[node_id] = [i[1:]]
            if id > 0:
                if type(node_list[id - 1]) == tuple:
                    if type(i) == tuple:
                        node_dict[node_id].append(i[1:])
                    else:
                        node_id += 1
                        node_dict[node_id] = []
                else:
                    node_dict[node_id].append(i[1:])
        return node_dict

    def rules(self, tree, feature_names):
        node_list = self.get_lineage(tree, feature_names)
        node_dict = self.get_interval_node(node_list)
        rules_dict = {}
        con_cal = []
        for k, v in node_dict.items():
            con = []
            con_cal_ = []
            for v_ in v:  # v_ = (split_side, threshold, feature)
                con.append("({} < {})".format(v_[2], v_[1]) if v_[0] == 'l'
                           else "({} >= {})".format(v_[2], v_[1]))
                con_cal_.append((self.datasets[v_[2]] < v_[1]) if v_[0] == 'l'
                                else (self.datasets[v_[2]] >= v_[1]))
            con_cal.append(reduce(lambda x, y: x * y, con_cal_))  # AND of the conditions
            rules_dict[k] = ['&'.join(con), round(self.datasets[con_cal[-1]][self.dep].mean(), 4)]
        rules = pd.DataFrame(rules_dict).T
        rules.columns = ['rule', 'p']
        return con_cal, rules

    def rule_indicator(self, d, cons):
        indicator = {}
        for id, con in enumerate(cons):
            reject = np.sum(~con)                        # samples rejected by the rule
            fugai = np.sum(con)                          # samples passed (covered)
            total = np.sum(con) + np.sum(~con)
            wuju = np.sum((~con) * (d[self.dep] == 0))   # good samples rejected
            wufang = d[con][self.dep].sum()              # bad samples passed
            lanjie_hei = total - fugai - wuju            # bad samples rejected
            shengyu_bai = fugai - wufang                 # good samples passed
            fugai_lv = round(fugai / total, 4)           # pass rate
            default_rate = round(wufang / np.sum(con), 4)
            precise = round((reject - wuju) / reject, 4)
            recall = round((reject - wuju) / d[self.dep].sum(), 4)
            distrub = round(np.sum((~con) * (d[self.dep] == 0)) / np.sum(d[self.dep] == 0), 4)
            ks = round(recall - distrub, 4)
            # lift = cumulative bad rate / average overdue rate
            lift = round(default_rate / (d[self.dep].mean()), 4)
            indicator[id + 1] = {'n_rejected': reject, 'rejected_bad': lanjie_hei,
                                 'rejected_good': wuju, 'n_passed': fugai,
                                 'passed_bad': wufang, 'pass_bad_rate': default_rate,
                                 'n_total': total, 'pass_rate': fugai_lv,
                                 'precision': precise, 'recall': recall,
                                 'disturb_rate': distrub, 'ks': ks, 'lift': lift}
        indicator = pd.DataFrame(indicator).T
        int_cols = ['n_rejected', 'rejected_bad', 'rejected_good', 'n_passed', 'passed_bad', 'n_total']
        indicator[int_cols] = indicator[int_cols].astype(int)
        return indicator

    def fit_plot(self):
        # DecisionTreeClassifier could be used instead for classification
        dtree = tree.DecisionTreeRegressor(max_depth=self.max_depth,
                                           min_samples_leaf=self.min_samples_leaf,
                                           min_samples_split=self.min_samples_split)
        x = self.datasets.drop(self.ex_lis, axis=1)
        y = self.datasets[self.dep]
        dtree = dtree.fit(x, y)
        with open("dt.dot", "w") as f:
            tree.export_graphviz(dtree, out_file=f)
        dot_data = StringIO()
        tree.export_graphviz(dtree, out_file=dot_data,
                             feature_names=x.columns,
                             class_names=[self.dep],
                             filled=True, rounded=True,
                             special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        # self.tree_to_code(dtree, x.columns)
        print('\n\n')
        con_cal, rules = self.rules(dtree, x.columns)
        indicator = self.rule_indicator(self.datasets, con_cal)
        cols = ['rule', 'n_rejected', 'rejected_bad', 'rejected_good', 'n_passed',
                'passed_bad', 'n_total', 'pass_rate', 'pass_bad_rate',
                'precision', 'recall', 'disturb_rate', 'ks', 'lift']
        rule = rules.merge(indicator, left_index=True, right_index=True)[cols]
        rule.sort_values('pass_bad_rate', ascending=True, inplace=True)
        return rule, graph.create_png()

def evaluate_func1(df, cols=None, groups=None, label='y', descs=None):
    '''
    Note: non-null and null samples are evaluated separately.
    df: input data, DataFrame
    cols: columns to evaluate, list
    groups: columns to group by, list (group columns should not contain NaN; fill them beforehand)
    label: label column name (1 = bad, 0 = good)
    descs: columns to accumulate in descending bin order; by default bins accumulate
           ascending, i.e. low-score bins accumulate up towards high-score bins
    return: DataFrame with the following columns
        'group_set': group key
        'var_name': variable name
        'flag': interception interval (bin)
        'total': sample count
        'var_empty': null count
        'var_get': variable coverage (non-null count)
        'stp': intercepted sample count
        'pos': bad sample count
        'neg': good sample count
        'cunsum_pos': cumulative bad count
        'cunsum_neg': cumulative good count
        'cunsum_stp': cumulative intercepted count
        'total_pos': overall bad count
        'total_neg': overall good count
        'total_stp': overall sample count
        'intercept': interception rate
        'precision': precision
        'recall': recall
        'disturb': disturbance rate
        'cum_precision': cumulative precision
        'avg_precision': average precision
        'cum_recall': cumulative recall
        'cum_disturb': cumulative disturbance rate
        'bks': interval KS value
        'ks': KS value
    '''

    df = df.apply(pd.to_numeric, errors='ignore')
    obj_cols = df.select_dtypes('object').columns.tolist()
    df[obj_cols] = df.select_dtypes('object').apply(pd.to_datetime, errors='ignore')

    # compute the metrics for a single group
    def define_mertrics(df, label, descs=None):
        def __num_q(ser, dropna):
            # pick a bin count based on the number of distinct values
            num = ser.nunique(dropna)
            if num > 25:
                q = 10
            elif num > 15:
                q = 8
            elif num >= 10:
                q = 6
            else:
                q = num
            return q

        dts = df.select_dtypes('datetime64[ns]').shape[1]
        objs = df.select_dtypes('object').shape[1]
        nums = len(df.columns.tolist()) - dts - objs
        print('data has {} datetime columns, {} object columns, {} numeric columns'.format(dts, objs, nums))

        res = []
        if not descs:
            descs = []
        for col in tqdm(df.columns.difference([label])):
            na_stp = df[df[col].isna() == True].shape[0]
            na_pos = df[df[col].isna() == True][label].sum()
            na_neg = na_stp - na_pos
            var_get = df.shape[0] - na_stp
            total = df.shape[0]
            ser = df[df[col].isna() == False][col]
            tmp = pd.DataFrame()
            # numeric with at least 10 distinct values: quantile binning
            if ser.dtype != object and ser.dtype != 'datetime64[ns]' and ser.nunique() >= 10:
                q = __num_q(ser, dropna=True)
                _, bins = pd.qcut(ser, q=q, retbins=True, precision=2, duplicates='drop', labels=False)
                bins[0] = bins[0] - 0.01
                cuts = pd.cut(ser, bins=bins)
                tmp['flag'] = cuts        # bin
                tmp['var_name'] = col     # variable
                tmp['label'] = df[label]  # label
            # numeric with fewer than 10 distinct values: raw values as bins
            elif ser.dtype != object and ser.dtype != 'datetime64[ns]' and ser.nunique() < 10:
                tmp['flag'] = ser
                tmp['var_name'] = col
                tmp['label'] = df[label]
            # non-numeric (categorical)
            else:
                tmp['flag'] = ser
                tmp['var_name'] = col
                tmp = tmp.astype(str)
                tmp['label'] = df[label]

            # non-null samples
            result_stp = tmp.groupby(['var_name', 'flag']).count().rename(columns={'label': 'stp'})
            result_pos = tmp.groupby(['var_name', 'flag'])['label'].sum().to_frame().rename(columns={'label': 'pos'}).astype(int)  # bad count
            result_neg = tmp[tmp['label'] == 0].groupby(['var_name', 'flag'])['label'].count().to_frame().rename(columns={'label': 'neg'}).astype(int)  # good count
            merge_result = result_stp.merge(result_pos, how='left', on=['var_name', 'flag']).merge(result_neg, how='left', on=['var_name', 'flag'])
            merge_result.reset_index(inplace=True)
            merge_result['rank'] = pd.Series([i + 1 for i in range(merge_result.shape[0])])
            if ser.dtype != object and ser.dtype != 'datetime64[ns]' and ser.nunique() >= 10:
                merge_result['flag'] = merge_result['flag'].cat.add_categories([0])
            merge_result.fillna(0, inplace=True)
            if col in descs:
                merge_result.sort_values(by=['rank'], axis=0, ascending=False, inplace=True, na_position='last')
            merge_result['cunsum_pos'] = merge_result.groupby('var_name')['pos'].cumsum().astype(int)  # cumulative bad
            merge_result['cunsum_neg'] = merge_result.groupby('var_name')['neg'].cumsum().astype(int)  # cumulative good
            merge_result['cunsum_stp'] = merge_result.groupby('var_name')['stp'].cumsum().astype(int)  # cumulative intercepted
            merge_result['total_pos'] = merge_result['cunsum_pos'].max()
            merge_result['total_neg'] = merge_result['cunsum_neg'].max()
            merge_result['total_stp'] = merge_result['cunsum_stp'].max()

            # null samples
            tmpna = pd.DataFrame()
            columns = ['var_name', 'flag', 'stp', 'pos', 'neg', 'rank', 'cunsum_pos', 'cunsum_neg', 'cunsum_stp', 'total_pos', 'total_neg', 'total_stp']
            # note: the last entry is the null-sample total, na_stp
            tmpna[columns] = pd.DataFrame(np.array([[col, 'empty', na_stp, na_pos, na_neg, 0, na_pos, na_neg, na_stp, na_pos, na_neg, na_stp]]), columns=columns)

            # concatenate non-null and null samples
            merge_result = pd.concat([merge_result, tmpna], axis=0)
            merge_result['var_get'] = var_get
            merge_result['var_empty'] = na_stp
            merge_result['total'] = total
            res.append(merge_result)

        res = pd.concat(res, axis=0)
        res = res.apply(pd.to_numeric, errors='ignore')
        # metrics (null samples are reported in their own 'empty' row)
        res['get_rate'] = res['var_get'] / (res['total'] + 0.001)               # variable coverage
        res['intercept'] = res['stp'] / (res['total_stp'] + 0.001)              # interception rate
        res['precision'] = res['pos'] / (res['stp'] + 0.001)                    # precision
        res['recall'] = res['pos'] / (res['total_pos'] + 0.001)                 # recall
        res['disturb'] = res['neg'] / (res['total_neg'] + 0.001)                # disturbance rate
        res['cum_precision'] = res['cunsum_pos'] / (res['cunsum_stp'] + 0.001)  # cumulative precision
        res['avg_precision'] = res['total_pos'] / (res['total_stp'] + 0.001)    # average precision
        res['cum_recall'] = res['cunsum_pos'] / (res['total_pos'] + 0.001)      # cumulative recall
        res['cum_disturb'] = res['cunsum_neg'] / (res['total_neg'] + 0.001)     # cumulative disturbance
        res['bks'] = abs(res['cum_recall'] - res['cum_disturb'])                # interval KS
        res = res.merge(res[res['flag'] != 'empty'].groupby('var_name')['bks'].max().to_frame().reset_index().rename(columns={'bks': 'ks'}), how='left', on=['var_name'])
        rs = res.drop_duplicates(subset=None, keep='first', inplace=False)  # de-duplicate
        # returned columns
        retcols = ['var_name', 'flag', 'var_empty', 'var_get', 'total', 'get_rate', 'stp', 'pos', 'neg', 'cunsum_stp', 'cunsum_pos', 'cunsum_neg', 'total_stp', 'total_pos', 'total_neg', 'intercept', 'precision', 'recall', 'disturb', 'cum_precision', 'avg_precision', 'cum_recall', 'cum_disturb', 'bks', 'ks']
        rs = rs[retcols]
        return rs

    # compute the metrics for each group
    group_name = 'group_set'
    tmp = []
    if not cols:
        cols = df.columns.tolist()
    if not groups:
        groups = ['nan_group_set']
        df[groups[0]] = '&&&'
    df[groups] = df[groups].astype(str).fillna('ng')
    if len(groups) > 1:
        df['group_set'] = df[groups[0]].str.cat([df[g] for g in groups[1:]], sep='_')
    else:
        df['group_set'] = df[groups[0]]
    group_set = df['group_set'].unique()
    for k in group_set:
        df_new = df[df['group_set'] == k][cols + [label]]
        res = define_mertrics(df_new, label=label, descs=descs)
        columns = res.columns.tolist()
        res[group_name] = k
        res = res[[group_name] + columns]
        tmp.append(res)
    outs = pd.concat(tmp, axis=0)
    return outs

print('Loaded at:', strftime("%Y-%m-%d %H:%M:%S", localtime()))

2. Reading data

data_changjing = pd.read_csv('*/全域mob3.csv')

# reading one sheet when a file contains several sheets
data_Tencent_lfq = pd.read_excel('*.xlsx', sheet_name='测试')

# join two DataFrames, similar to Excel's VLOOKUP; `on` accepts one key or a list of keys
# (a toy multi-key example follows this block)
data_new3 = pd.merge(data_changjing, data_y, on=['num_id'])

# concatenate two DataFrames row-wise (DataFrame.append was removed in pandas 2.0)
data_new3 = pd.concat([data_new3, data_new2])

# data description
data_new3.describe()          # distribution summary
data_new3.info()              # dtypes, columns, null counts
data_new3.shape               # (rows, columns)
data_new3.dtypes              # column dtypes (note: `dtypes`, not `dtype`)
data_new3[''].value_counts()  # value distribution of one column
data_new3.rename(columns={'score': 'Bscore'}, inplace=True)  # rename a column
data_new3 = data_new3[['A', 'B', 'C', 'D', 'E']]  # keep only the columns you need out of A-G
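The toy multi-key merge promised above; both frames and their columns are invented for illustration:

left = pd.DataFrame({'num_id': [1, 1, 2], 'ym': ['2020-01', '2020-02', '2020-01'], 'score': [610, 633, 580]})
right = pd.DataFrame({'num_id': [1, 2], 'ym': ['2020-01', '2020-01'], 'overdue': [0, 1]})
pd.merge(left, right, on=['num_id', 'ym'])  # keeps only rows that match on both keys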

# groupby and aggregation functions
# groupby accepts a single key or a list of keys: groupby(['bins2']) / groupby(['A', 'B', 'C'])
def ration(x):
    return x.sum() / x.count()

res2.groupby(['bins2'])['mob4'].agg(['count', 'sum', ration])
# agg accepts count, min, max, mean, sum, or your own function

# selecting and displaying data
data1 = data_new3[['num_id', 'fina_date', 'zhiye']]
data1.head()

Equal-frequency and equal-width binning, storing the bin labels in a new column

# equal-frequency binning
res2['bins'] = pd.qcut(res2['score'], 10, duplicates='drop', retbins=True)[0]

# equal-width binning
res2['bins2'] = pd.cut(res2['score'], 10, duplicates='drop', retbins=True)[0]
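The difference shows clearly on skewed synthetic data: qcut keeps the counts per bin roughly equal, cut keeps the bin widths equal:

s = pd.Series(np.random.RandomState(1).exponential(scale=100, size=1000))
pd.qcut(s, 4).value_counts()  # ~250 rows per bin, uneven widths
pd.cut(s, 4).value_counts()   # equal widths, very uneven counts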

Pivot tables

# pd.pivot_table(result_with_overdue, index=['RI_CFV32_lvl'], columns=['bins'], aggfunc=['count'], values=['mob4'], fill_value=0, margins=True, dropna=True)
pd.pivot_table(result_with_overdue, index=['RI_CFV32_lvl'], columns=['bins'], values=['mob4'], fill_value=0, aggfunc=['count'])  # counts per cell; pass other functions via aggfunc
pd.pivot_table(result_with_overdue, index=['RI_CFV32_lvl'], columns=['mob4'], values=['num_id'], fill_value=0, aggfunc=['count'])

Getting a DataFrame's column names

data_new3.columns

Result:

Index(['num_id', 'zhiye', 'weiyue', 'gongzhai', 'qingchang', 'zhuxing', 'lvyue', 'shouxin', 'xiaofei', 'xingqu', 'chengzhang', 'overdue'], dtype='object')

3. Using the scorecard package scorecardpy

import scorecardpy as sc

# chi-square-style binning
bins_new = sc.woebin(data_new3.loc[data_new3.overdue != -1, ['gongzhai', 'overdue']], y="overdue")
bins_new  # binning result

# plot the bin distributions
woebin_plot = sc.woebin_plot(bins_new)
woebin_plot

# the bins for one variable as a DataFrame
bins_new['gongzhai']

# aggregate over the bins
bins_new['gongzhai']['count'].sum()
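To push the fitted bins back onto the data as WOE values, scorecardpy also provides woebin_ply; a short sketch reusing data_new3 and bins_new from above:

# replaces 'gongzhai' with a 'gongzhai_woe' column
woe_df = sc.woebin_ply(data_new3.loc[data_new3.overdue != -1, ['gongzhai', 'overdue']], bins_new)
woe_df.head()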

Plotting calls

plt_data = Score_set_overdue[(Score_set_overdue['order_month_x'] == '2019-11') & (Score_set_overdue['class_y'] == 'lfq')]
ks = PlotKS(plt_data['pred_score'], plt_data['overdue'], n=20, asc=True)
ks
plt.show()

[Figure: KS curve]

bins_new = sc.woebin(plt_data.loc[plt_data.overdue != -1, ['pred_score', 'overdue']], y="overdue")
woebin_plot = sc.woebin_plot(bins_new)
plt.figure(figsize=(5, 5))
woebin_plot

[Figure: overdue rate by score interval]

Multi-feature crossing and selection

# load the data
cross_data = Score_set_overdue[(Score_set_overdue['order_month_x'] == '2020-01') & (Score_set_overdue['class_y'] == 'dae')]

# variables excluded from modelling: everything except the two scores, including the label column
ex_list = cross_data.columns.drop(['Bscore', 'Tencent_score']).tolist()

# fit the decision tree
rules, graph = rule_tree(datasets=cross_data, ex_lis=ex_list, max_depth=2, dep='overdue',
                         min_samples=0.01, min_samples_leaf=200, min_samples_split=200).fit_plot()
Image(graph)
rules  # inspect the selected rules

Scorecard result processing

outs_Bscore = evaluate_func1(Bscore_Score_set_overdue[Bscore_Score_set_overdue['class_x'] != 'None'], cols=['Bscore'], groups=['order_month', 'class_x'], label='overdue')
outs_Bscore[outs_Bscore['flag'] != 'empty']  # drop the null-sample rows
