在信贷领域,AUC和KS指标能体现模型的学习效果,但是在评估模型的相对好坏、以及制定使用方案的时候,还需要通过“通过率&逾期率”曲线来评估模型。
横坐标为累计通过率,纵坐标为累计逾期率。在相同的通过率下,逾期率越高,曲线位置就越靠近上方,说明模型效果越差。
1、可以理解为曲线下的面积越小越好,这与AUC(ROC曲线下面积越大越好)的逻辑相反。
2、要看细节部分:在某个局部的通过率区间内,某个模型是否局部更好,多个模型是否可以按区间交叉使用。
3、是否整个曲线都是缠绕的,如果是的话,考虑选择AUC评估指标吧。
# Simplified pass-rate / overdue-rate computation on a plain Python list.
def get_pass_and_overdue_with_list(pred_result_and_real_label: List[Tuple[float, float]]) -> Tuple[List, List]:
    """
    Turn (predicted probability, real label) pairs into pass-rate / overdue-rate sequences.

    Example input:
    pred_result_and_real_label = data[['ruleset_111_mexico_loan_xgboost_score','first1_overdue3']].values.tolist()

    The pairs are ranked by predicted probability in ascending order. For the k-th ranked
    sample (1-based) the function records:
      * pass rate    = k / total number of samples
      * overdue rate = sum of the labels of the first k samples / k

    :param pred_result_and_real_label: list of pairs whose first element is the predicted
        probability and whose second element is the real label. The list is NOT modified.
    :return: (pass_rate, overdue_rate), two lists with one entry per input sample;
        both are empty when the input is empty.
    """
    # sorted() builds a new list, so the caller's list keeps its original order
    # (an in-place .sort() would silently reorder the argument).
    ranked = sorted(pred_result_and_real_label, key=lambda pair: pair[0])

    the_length = len(ranked)
    print(f"the_length {the_length} ")

    pass_rate = []
    overdue_rate = []
    # Running total of the labels seen so far (i.e. overdue count for 0/1 labels).
    overdue_sum = 0.0

    for rank, row in enumerate(ranked, start=1):
        pass_rate.append(rank / the_length)
        overdue_sum += row[1]
        overdue_rate.append(overdue_sum / rank)
    return pass_rate, overdue_rate
-
-
def get_auc_ks(data, prob_col='prob', label_col='overdue'):
    """
    Compute the AUC and KS statistics of a score column against a label column.

    The ROC curve is obtained from sklearn's roc_curve; AUC is the area under that
    curve and KS is the largest gap between the true-positive and false-positive rates.

    :param data: object indexable by column name (presumably a pandas DataFrame)
    :param prob_col: name of the column holding the predicted score
    :param label_col: name of the column holding the real label
    :return: (auc_value, ks)
    """
    from sklearn import metrics

    y_true = data[label_col]
    y_score = data[prob_col]
    fpr, tpr, _ = metrics.roc_curve(y_true, y_score)

    # KS is the maximum vertical distance between the TPR and FPR curves.
    ks = max(tpr - fpr)
    auc_value = metrics.auc(fpr, tpr)
    return auc_value, ks
-
-
def _truncate_to_width(value, width):
    """Keep the first `width` characters of the plain-decimal text of `value` and parse it back."""
    from decimal import Decimal

    text = str(value)
    if 'e' in text.lower():
        # str() switched to scientific notation (very small or very large magnitude).
        # Slicing that text would cut into the mantissa/exponent ("1.2345e-05" -> "1.234",
        # "1.5e-06" -> "1.5e-"), so expand it to plain decimal notation first.
        text = format(Decimal(text), 'f')
        # Never cut inside the integer part, that would change the magnitude.
        width = max(width, len(text.split('.')[0]))
    return float(text[:width])


def get_pass_and_overdue_by_dataframe(data, prob_col='prob', label_col='overdue', float_round=5):
    """
    Build the cumulative pass-rate / overdue-rate curve from a DataFrame.

    Scores are first truncated to the first `float_round` characters of their decimal
    text (e.g. 0.123456 -> 0.123 with the default of 5), which merges near-identical
    scores into one threshold. For every distinct truncated score p, in ascending order:
      * pass rate    = (number of rows with score <= p) / (total number of rows)
      * overdue rate = (sum of labels of rows with score <= p) / (number of rows with score <= p)

    The input DataFrame is not modified.

    :param data: DataFrame containing `prob_col` and `label_col`
    :param prob_col: name of the predicted-score column
    :param label_col: name of the label column
    :param float_round: number of leading characters of the score text that are kept
    :return: (pass_rats, overdue_rats), two lists with one entry per distinct truncated score
    """
    # Work on a private copy so the caller's frame keeps its values, columns and row order.
    frame = data[[prob_col, label_col]].copy()
    frame[prob_col] = frame[prob_col].apply(lambda x: _truncate_to_width(x, float_round))

    ttl_cnt = data.shape[0]

    # groupby sorts the distinct scores ascending, so cumulative sums over the groups give
    # the "score <= p" totals for every threshold in one pass (no per-threshold re-filtering).
    per_score = frame.groupby(prob_col)[label_col]
    cum_cnt = per_score.size().cumsum()
    cum_overdue = per_score.sum().cumsum()

    pass_rats = (cum_cnt / ttl_cnt).tolist()
    overdue_rats = (cum_overdue / cum_cnt).tolist()
    return pass_rats, overdue_rats
-
def get_auc_ks_pass_overdue_rat(data, prob_col='prob', label_col='overdue', float_round=5, plot=False):
    """
    Compute AUC, KS and the pass-rate / overdue-rate curve for one data set.

    Only `prob_col` and `label_col` are copied out of `data`; the copy is what gets
    passed to get_auc_ks and get_pass_and_overdue_by_dataframe.

    :param data: DataFrame containing `prob_col` and `label_col`
    :param prob_col: name of the predicted-score column
    :param label_col: name of the label column
    :param float_round: forwarded to get_pass_and_overdue_by_dataframe
    :param plot: when truthy, draw the pass-rate / overdue-rate curve with `plt`
        (presumably matplotlib.pyplot imported at file level) and show it
    :return: (auc_value, ks, pass_rats, overdue_rats, num) where num is the row count
    """
    print(f'get_auc_ks_pass_overdue_rat prob_col:{prob_col} label_col:{label_col}')
    data = data[[prob_col, label_col]].copy()
    num = data.shape[0]
    auc_value, ks = get_auc_ks(data, prob_col=prob_col, label_col=label_col)
    pass_rats, overdue_rats = get_pass_and_overdue_by_dataframe(
        data, prob_col=prob_col, label_col=label_col, float_round=float_round)

    # Plain truthiness test, matching how the sibling functions treat their `plot` flag.
    if plot:
        plt.figure(figsize=(6, 6))
        plt.title(f'{label_col}|num:{num}')
        plt.plot(pass_rats, overdue_rats, color='green', label=f'auc:{auc_value:.4f}_ks:{ks:.4f}')
        plt.legend()  # show the legend carrying the auc/ks values
        plt.xlabel('pass_rat')
        plt.ylabel('cum_overdue')
        plt.grid()
        plt.show()
    return auc_value, ks, pass_rats, overdue_rats, num
-
-
def get_auc_ks_pass_overdue_rat_with_data_type(train_data,
                                               data_type='data_type',
                                               dstx='dstx',
                                               zcfl='zcfl',
                                               prob_col='prob',
                                               label_col='overdue',
                                               float_round=5,
                                               plot=False):
    """
    Compute the model metrics separately for the dstx and zcfl segments of a data set.

    Rows are split by comparing `train_data[data_type]` with the `dstx` and `zcfl`
    markers; each segment is evaluated with get_auc_ks_pass_overdue_rat.

    :param train_data: DataFrame containing `data_type`, `prob_col` and `label_col`
    :param data_type: name of the column that marks the segment of each row
    :param dstx: value of `data_type` identifying the dstx segment
    :param zcfl: value of `data_type` identifying the zcfl segment
    :param prob_col: name of the predicted-score column
    :param label_col: name of the label column
    :param float_round: score truncation width, forwarded to get_auc_ks_pass_overdue_rat
    :param plot: when truthy, draw both pass-rate / overdue-rate curves as two stacked subplots
    :return: 10-tuple (auc, ks, pass_rats, overdue_rats, num) for dstx followed by the
        same five values for zcfl
    """
    segment_results = []
    for marker in (dstx, zcfl):
        segment = train_data[train_data[data_type] == marker][[prob_col, label_col]].copy()
        # float_round is forwarded (it used to be hard-coded to 5 here, ignoring the argument).
        # plot stays False: the combined two-panel figure is drawn below instead.
        segment_results.append(get_auc_ks_pass_overdue_rat(
            segment,
            prob_col=prob_col,
            label_col=label_col,
            float_round=float_round,
            plot=False))

    if plot:
        # The figure size has to be set before the figure is created to take effect on it.
        plt.rcParams['figure.figsize'] = (9, 12.0)
        _, axs = plt.subplots(2, 1)
        for ax, segment_name, result in zip(axs, ('dstx', 'zcfl'), segment_results):
            auc_value, ks, pass_rats, overdue_rats, num = result
            ax.plot(pass_rats, overdue_rats)
            ax.set_ylim(0, 0.5)
            ax.set_xlim(0, 1)
            ax.set_title(f'oot {segment_name} nocv fpd3|num:{num}')
            ax.set_xlabel('pass_rat')
            ax.set_ylabel('cum_overdue')
            ax.grid(True)
            ax.legend(["auc:%.4f ks:%.4f" % (auc_value, ks)], loc="lower left")

    dstx_result, zcfl_result = segment_results
    return (*dstx_result, *zcfl_result)
def plot_dstx_zcfl_pass_overdue(modes_metrics, label='fpd3', line_desc=None,
                                colors=('skyblue', 'green', 'blue', 'y', 'r')):
    """
    Draw one figure with two stacked subplots: the dstx curves on top, the zcfl curves below.

    Each subplot overlays the pass-rate / overdue-rate curve of every entry in `modes_metrics`.

    :param modes_metrics: sequence of rows; each row contains oot_dstx_auc_value, oot_dstx_ks,
        oot_dstx_pass_rats, oot_dstx_overdue_rats, oot_dstx_num, oot_zcfl_auc_value, oot_zcfl_ks,
        oot_zcfl_pass_rats, oot_zcfl_overdue_rats, oot_zcfl_num (in that order)
    :param label: overdue label shown in the subplot titles
    :param line_desc: per-row legend description; defaults to an empty string for every row
    :param colors: line colors; reused cyclically when there are more rows than colors
    :return: None
    """
    import matplotlib.font_manager as fm

    font_size = 12
    # Font used for the y-axis label and the legend (line_desc may need it for non-ASCII text).
    # NOTE(review): the font file path is hard-coded; confirm it exists on the target machine.
    font = fm.FontProperties(fname='/data/simhei.ttf', size=font_size)
    plt.rcParams['figure.figsize'] = (9, 12.0)
    _, axs = plt.subplots(2, 1)
    if line_desc is None:
        line_desc = [''] * len(modes_metrics)

    # Fields 0-4 of every row describe the dstx segment, fields 5 onward the zcfl segment.
    segments = (('dstx', slice(0, 5)), ('zcfl', slice(5, None)))
    for ax, (segment_name, fields) in zip(axs, segments):
        num = None
        for i, row in enumerate(modes_metrics):
            auc_value, ks, pass_rats, overdue_rats, num = row[fields]
            ax.plot(pass_rats, overdue_rats, color=colors[i % len(colors)],
                    label=f'auc:{auc_value:.4f}|ks:{ks:.4f}_{line_desc[i]}')
        ax.set_ylim(0, 0.4)
        ax.set_xlim(0, 1)
        # The title shows the sample count of the last row drawn for this segment.
        ax.set_title(f'oot {segment_name} nocv {label}|num:{num}')
        ax.set_xlabel('pass_rat')
        ax.set_ylabel('cum_overdue', fontproperties=font)
        ax.legend(loc="lower right", prop=font)  # show the legend
        ax.grid(True)
绘图结果如上图