Python實現F1 AUC計算

2021-09-11 17:04:30 字數 3983 閱讀 6363

# -*- coding: utf-8 -*- #

# author neu_lightbulb-----zhangj

import numpy as np

import pandas as pd

class score():
    """Confusion-matrix, F-beta and AUC metrics for a binary classifier.

    Parameters
    ----------
    pre_score : iterable of numbers
        Predicted scores, one per sample.
    rel_label : iterable
        True labels aligned with ``pre_score``; ``1`` marks a positive
        sample and ``0`` a negative one.
    threshold : number
        A sample is predicted positive when its score is ``>= threshold``.
    beta : number
        Weight of recall relative to precision used by :meth:`get_f1`.

    The confusion-matrix counters ``tn``, ``fn``, ``fp`` and ``tp`` are
    filled in once, at construction time.
    """

    def __init__(self, pre_score, rel_label, threshold, beta):
        self.tn = 0
        self.fn = 0
        self.fp = 0
        self.tp = 0
        self.pre_score = pre_score
        self.rel_label = rel_label
        self.threshold = threshold
        self.beta = beta
        # Tally the confusion matrix once so the metric methods are cheap.
        for pre, rel in zip(self.pre_score, self.rel_label):
            self.__getcm_count(pre, rel)

    def __getcm(self, pre, rel):
        """Return 'tn', 'fn', 'fp' or 'tp' for one (score, label) pair.

        Returns ``None`` when the label is neither 0 nor 1, or when the
        score compares neither below nor at/above the threshold (e.g. NaN).
        """
        if pre < self.threshold:
            if rel == 0:
                return 'tn'
            if rel == 1:
                return 'fn'
        if pre >= self.threshold:
            if rel == 0:
                return 'fp'
            if rel == 1:
                return 'tp'
        return None

    def get_cm(self):
        """Return the confusion-matrix cell name of every sample, in order."""
        return [self.__getcm(pre, rel)
                for pre, rel in zip(self.pre_score, self.rel_label)]

    def __getcm_count(self, pre, rel):
        """Increment the counter matching one (score, label) pair."""
        cell = self.__getcm(pre, rel)
        if cell is not None:
            # The cell names double as the counter attribute names.
            setattr(self, cell, getattr(self, cell) + 1)

    def get_f1(self):
        """Return the F-beta score at the configured threshold.

        Returns 0.0 when there are no true positives; this also covers the
        cases where precision or recall would divide by zero.
        """
        if self.tp == 0:
            return 0.0
        p = self.tp / (self.tp + self.fp)
        r = self.tp / (self.tp + self.fn)
        beta_sq = self.beta * self.beta
        return (beta_sq + 1) * p * r / (beta_sq * p + r)

    # Method 2: `precision` is the score resolution (number of bins).
    def get_auc_by_count(self, precision=100):
        """Approximate the AUC by bucketing scores into a histogram.

        Scores are expected to lie in ``[0, 1]``; each is mapped to bin
        ``int(score * precision)``. Scores that land in the same bin are
        treated as ties, so the result is exact only up to that resolution.

        Raises
        ------
        ZeroDivisionError
            If the labels do not contain both a positive and a negative.
        """
        pairs = list(zip(self.pre_score, self.rel_label))
        # Count positives with the same `== 1` test used for binning below.
        positive_len = sum(1 for _, rel in pairs if rel == 1)
        negative_len = len(pairs) - positive_len
        # Number of (positive, negative) pairs that can be compared.
        total_case = positive_len * negative_len
        if total_case == 0:
            raise ZeroDivisionError(
                "AUC is undefined: need at least one positive and one "
                "negative sample")

        # Per-bin counters for positive and negative samples.
        pos_histogram = [0] * (precision + 1)
        neg_histogram = [0] * (precision + 1)
        bin_width = 1.0 / precision
        for pre, rel in pairs:
            nth_bin = int(pre / bin_width)
            if rel == 1:
                pos_histogram[nth_bin] += 1
            else:
                neg_histogram[nth_bin] += 1

        # Walk the bins from low to high score: a positive beats every
        # negative in a lower bin and half-beats negatives in its own bin.
        accumulated_neg = 0
        satisfied_pair = 0.0
        for pos_count, neg_count in zip(pos_histogram, neg_histogram):
            satisfied_pair += (pos_count * accumulated_neg
                               + pos_count * neg_count * 0.5)
            accumulated_neg += neg_count
        return satisfied_pair / float(total_case)

    # Method 3: rank-sum formula.
    def get_auc_by_rank(self):
        """Compute the AUC from the rank sum of the positive samples.

        Samples are ranked by ascending score (lowest score gets rank 1)
        and tied scores share the average of their ranks. With ``M``
        positives and ``n`` negatives the result is
        ``(sum_of_positive_ranks - M * (M + 1) / 2) / (M * n)``.

        Raises
        ------
        ZeroDivisionError
            If the labels do not contain both a positive and a negative.
        """
        scores = pd.Series(list(self.pre_score), dtype=float)
        is_positive = pd.Series(list(self.rel_label)) == 1
        positive_len = int(is_positive.sum())
        negative_len = len(scores) - positive_len
        if positive_len == 0 or negative_len == 0:
            raise ZeroDivisionError(
                "AUC is undefined: need at least one positive and one "
                "negative sample")

        # Average ranks handle ties without a sentinel row, so a real
        # score of 0 can no longer be mistaken for padding.
        ranks = scores.rank(method='average')
        rank_sum = float(ranks[is_positive].sum())
        return ((rank_sum - positive_len * (positive_len + 1) / 2)
                / (positive_len * negative_len))

if __name__ == '__main__':
    # Demo: 20 samples, the first 12 negative and the last 8 positive.
    learn_data_l2 = [0.2, 0.3, 0.4, 0.35, 0.6, 0.55, 0.2, 0.57, 0.3, 0.15,
                     0.77, 0.33, 0.9, 0.49, 0.45, 0.41, 0.66, 0.43, 0.7, 0.4]
    learn_data_r2 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]

    # Put predicted scores and true labels in the two columns read below.
    learn_data2 = pd.DataFrame({'learn': learn_data_l2, 'real': learn_data_r2})

    # Threshold 0.5, beta 1 (plain F1).
    score2 = score(learn_data2['learn'], learn_data2['real'], 0.5, 1)

    print(score2.get_cm())
    print(score2.get_f1())
    print(score2.get_auc_by_count())
    print(score2.get_auc_by_rank())

python計算auc的方法

1 安裝 scikit-learn 1.1 scikit-learn 依賴 分別檢視上述三個依賴的版本:`python -V`,結果:Python 2.7.3;`python -c 'import scipy; print scipy.version.version'`,scipy 版本結果:0.9.0;`python -c`...

python計算auc指標例項

1 安裝 scikit-learn 1.1 scikit-learn 依賴:Python(2.6 或 3.3)、NumPy(1.6.1)、SciPy(0.9)。分別檢視上述三個依賴的版本:`python -V`,結果:Python 2.7.3;`python -c 'import scipy; print scipy.v`...

使用R和Python計算AUC

某日重新灑下的分割線,無奈的我又用回了python 原因有兩個,第一python用了好久了,不想再去用r了,雖然r的ggplot畫圖很好看,不過今天安裝了python的ggplot庫 好激動!第二,也是r的一大缺憾,就是迴圈簡直是慢得離譜。所以又用回了python,於是,就硬著頭皮來分析之前的程式裡...