機器學習之使用Apriori演算法進行關聯分析

2021-08-03 11:04:59 字數 3530 閱讀 5430

from numpy import *

def loaddataset():
    """Return the small demo transaction database used by the examples.

    Returns:
        A new list of four transactions; each transaction is a list of
        integer item identifiers.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions

def createc1(dataset):
    """Build C1, the list of candidate 1-itemsets.

    Args:
        dataset: Iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list with one single-item frozenset per distinct item, ordered by
        item value. frozenset is used because the itemsets must be immutable
        and hashable.
    """
    # A set de-duplicates in O(1) per item instead of scanning a list.
    unique_items = set()
    for transaction in dataset:
        unique_items.update(transaction)
    return [frozenset([item]) for item in sorted(unique_items)]

def scand(d, ck, minsupport):
    """Count candidate itemsets over the transactions and keep the frequent ones.

    Args:
        d: Sized, re-iterable collection of transactions, each a set of items.
        ck: Re-iterable collection of candidate itemsets (frozensets); it is
            walked once per transaction.
        minsupport: Minimum support (fraction of transactions containing the
            itemset) for a candidate to be kept.

    Returns:
        A ``(retlist, supportdata)`` tuple. ``retlist`` holds the candidates
        whose support is at least ``minsupport``, most recently first-seen
        candidate first. ``supportdata`` maps every candidate that occurred
        in at least one transaction to its support, including those below
        the threshold.
    """
    sscnt = {}
    for tid in d:
        for can in ck:
            if can.issubset(tid):
                sscnt[can] = sscnt.get(can, 0) + 1

    # sscnt is only non-empty when d is non-empty, so the division is safe.
    numitems = len(d)
    retlist = []
    supportdata = {}
    for key, count in sscnt.items():
        support = count / numitems
        if support >= minsupport:
            retlist.append(key)
        supportdata[key] = support
    # Same ordering as repeatedly inserting at index 0, without the O(n) shifts.
    retlist.reverse()
    return retlist, supportdata

#~ dataset=loaddataset()

#~ c1=createc1(dataset)

#~ print(c1)

#~ d=list(map(set,dataset))

#~ l1,suppdata0=scand(d,c1,0.5)

#~ print(l1)

def apriorigen(lk, k):
    """Generate the candidate k-itemsets Ck from the frequent itemsets Lk.

    Two itemsets are joined when their first ``k - 2`` items (in sorted
    order) are identical, so each candidate is produced at most once.

    Args:
        lk: List of frozensets, all expected to have ``k - 1`` items.
        k: Number of items in each generated candidate.

    Returns:
        A list of frozensets, one per joinable pair, in pair order.
    """
    # Sort BEFORE slicing: a frozenset has no defined iteration order, so
    # slicing the raw list first would compare arbitrary items.
    prefixes = [sorted(itemset)[:k - 2] for itemset in lk]
    retlist = []
    for i, first in enumerate(lk):
        for j in range(i + 1, len(lk)):
            if prefixes[i] == prefixes[j]:
                retlist.append(first | lk[j])
    return retlist

def apriori(dataset, minsupport=0.5):
    """Find all frequent itemsets in ``dataset`` with the Apriori algorithm.

    Args:
        dataset: Iterable of transactions, each an iterable of hashable items.
        minsupport: Minimum support (fraction of transactions) an itemset
            needs to be considered frequent.

    Returns:
        A ``(l, supportdata)`` tuple. ``l[i]`` is the list of frequent
        itemsets with ``i + 1`` items; the final entry is always an empty
        list, which is what terminates the search. ``supportdata`` maps each
        counted itemset to its support, as returned by ``scand``.
    """
    # Materialise once so the data is walked a single time and any iterable
    # (including a generator) is accepted.
    d = [set(transaction) for transaction in dataset]
    c1 = createc1(d)
    l1, supportdata = scand(d, c1, minsupport)
    l = [l1]
    k = 2
    while l[k - 2]:
        ck = apriorigen(l[k - 2], k)
        lk, supk = scand(d, ck, minsupport)  # scan the transactions to get Lk
        supportdata.update(supk)
        # Record this level; without it l[k - 2] is out of range on the next pass.
        l.append(lk)
        k += 1
    return l, supportdata

#~ dataset=loaddataset()

#~ l,suppdata=apriori(dataset)

#~ print(l)

#~ print(apriorigen(l[0],2))

def generaterules(l, supportdata, minconf=0.7):
    """Derive association rules from the frequent itemsets.

    Args:
        l: List of frequent-itemset lists, where ``l[i]`` holds the itemsets
            with ``i + 1`` items (the shape returned by ``apriori``).
        supportdata: Dict mapping itemsets to their support.
        minconf: Minimum confidence a rule needs to be reported.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigrulelist = []
    # l[0] holds single items, which cannot be split into a rule.
    for i in range(1, len(l)):
        for freqset in l[i]:
            h1 = [frozenset([item]) for item in freqset]
            # Always evaluate the single-item consequents first. Going
            # straight to rulesfromconseq would skip them for itemsets of
            # three or more items, because it starts by merging consequents.
            h1 = calcconf(freqset, h1, supportdata, bigrulelist, minconf)
            # Larger consequents only exist for 3+ item sets, and merging
            # needs at least two surviving consequents.
            if i > 1 and len(h1) > 1:
                rulesfromconseq(freqset, h1, supportdata, bigrulelist, minconf)
    return bigrulelist

def calcconf(freqset, h, supportdata, brl, minconf=0.7):
    """Evaluate the rules ``freqset - conseq --> conseq`` for each consequent.

    Every rule whose confidence reaches ``minconf`` is printed and appended
    to ``brl`` as an ``(antecedent, consequent, confidence)`` tuple.

    Args:
        freqset: The frequent itemset (frozenset) the rules are drawn from.
        h: Iterable of candidate consequents (frozensets).
        supportdata: Dict mapping itemsets to their support.
        brl: List that accepted rules are appended to (mutated in place).
        minconf: Minimum confidence for a rule to be accepted.

    Returns:
        The list of consequents whose rule met ``minconf``.

    Raises:
        KeyError: If ``freqset`` or an antecedent has no entry in
            ``supportdata``.
    """
    prunedh = []  # consequents that survive the confidence threshold
    for conseq in h:
        antecedent = freqset - conseq
        conf = supportdata[freqset] / supportdata[antecedent]  # confidence
        if conf >= minconf:
            print(freqset-conseq,'-->',conseq,'conf:',conf)
            brl.append((antecedent, conseq, conf))
            prunedh.append(conseq)
    return prunedh

def rulesfromconseq(freqset, h, supportdata, brl, minconf=0.7):
    """Generate rules with larger consequents by merging the ones in ``h``.

    The consequents in ``h`` are merged into candidates one item larger,
    those candidates are scored with ``calcconf`` (which appends accepted
    rules to ``brl``), and the survivors are merged again recursively.

    Args:
        freqset: The frequent itemset (frozenset) the rules are drawn from.
        h: List of consequents (frozensets), all of the same size.
        supportdata: Dict mapping itemsets to their support.
        brl: List that accepted rules are appended to (mutated in place).
        minconf: Minimum confidence for a rule to be accepted.

    Returns:
        None; results are delivered through ``brl``.
    """
    if not h:
        # Nothing to merge; also avoids indexing h[0] on an empty list.
        return
    m = len(h[0])  # size of every consequent in h (any element will do)
    # A consequent of m + 1 items must still leave a non-empty antecedent.
    if len(freqset) > m + 1:
        hmp1 = apriorigen(h, m + 1)  # candidate consequents of size m + 1
        hmp1 = calcconf(freqset, hmp1, supportdata, brl, minconf)
        if len(hmp1) > 1:  # need at least two sets to merge further
            rulesfromconseq(freqset, hmp1, supportdata, brl, minconf)

#~ dataset=loaddataset()

#~ l,suppdata=apriori(dataset,minsupport=0.5)

#~ rules=generaterules(l,suppdata,minconf=0.5)

#~ print(rules)

# Demo: mine frequent itemsets from the whitespace-separated file
# 'mushroom.dat' (one transaction per line) and print the 3-itemsets that
# contain the item '2'.
# NOTE(review): presumably '2' encodes a specific mushroom feature value;
# confirm against the data set description.
with open('mushroom.dat') as datafile:  # closed even if parsing fails
    mushdatset = [line.split() for line in datafile]

l, suppdata = apriori(mushdatset, minsupport=0.3)

# l[2] holds the frequent 3-itemsets; guard against a result with fewer levels.
for item in (l[2] if len(l) > 2 else []):
    if '2' in item:
        print(item)

機器學習之Apriori

1. 幾個概念 (1)關聯分析:一種在大規模資料中尋找有趣關係的任務。這種有趣關係一般有兩種形式:頻繁項集或者關聯規則。(2)頻繁項集:經常、頻繁出現在一起的物品集合,通常用一對 {} 來表示。(3)關聯規則:兩種物品之間存在的關聯關係,通常用 → 來表示。(4)支援度:這是用來衡量頻繁項集的因子。一個項集的支援度即為一個資料...

機器學習演算法 之Apriori

Apriori 演算法不同於以前接觸過的機器學習演算法,這種演算法用於在資料集中尋找有趣的關係。這些關係可以有兩種形式:頻繁項集或者關聯規則。關於演算法的詳細介紹參見: def apriori(dataset, minsupport=0.5): c1 = createc1(dataset); d = map(set, da...

機器學習之Apriori演算法python實現

coding utf 8 created on sun dec 23 15 50 25 2018 author muli from future import print function import pandas as pd 自定義連線函式,用於實現l 到c k的連線 def connect s...