Grouping Unlabeled Data with the k-Means Clustering Algorithm

2021-07-13 11:57:34

kmean.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

from numpy import *

# Load the data set from a tab-delimited text file
def loaddataset(filename):
    datamat = []
    fr = open(filename)
    for line in fr.readlines():
        curline = line.strip().split('\t')
        fltline = list(map(float, curline))   # convert every field to a float
        datamat.append(fltline)               # one row per data point
    return datamat

# Euclidean distance between two vectors
def disteclud(veca, vecb):
    return sqrt(sum(power(veca - vecb, 2)))

# Build a set of k random centroids for the given data set
def randcent(dataset, k):
    n = shape(dataset)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        minj = min(dataset[:, j])
        rangej = float(max(dataset[:, j]) - minj)   # value range of feature j
        # random.rand(k, 1) draws k random numbers in [0, 1)
        centroids[:, j] = minj + rangej * random.rand(k, 1)
    return centroids

# k-means clustering
def kmeans(dataset, k, distmeas=disteclud, createcent=randcent):
    m = shape(dataset)[0]                 # total number of data points
    # cluster assignment matrix: column 0 holds the cluster index, column 1 the squared error
    clusterassment = mat(zeros((m, 2)))
    centroids = createcent(dataset, k)
    clusterchanged = True
    # iterate assign-points / recompute-centroids until no assignment changes
    while clusterchanged:
        clusterchanged = False
        for i in range(m):
            mindist = inf
            minindex = -1
            # find the nearest centroid
            for j in range(k):
                distji = distmeas(centroids[j, :], dataset[i, :])
                if distji < mindist:
                    mindist = distji
                    minindex = j
            if clusterassment[i, 0] != minindex:
                clusterchanged = True
            clusterassment[i, :] = minindex, mindist ** 2
        # update the centroid positions
        for cent in range(k):
            ptsinclust = dataset[nonzero(clusterassment[:, 0].A == cent)[0]]
            centroids[cent, :] = mean(ptsinclust, axis=0)
    return centroids, clusterassment

# bisecting k-means clustering
def bikeans(dataset, k, distmeas=disteclud):
    m = shape(dataset)[0]
    # column 0 stores the cluster assignment, column 1 the squared error
    clusterassment = mat(zeros((m, 2)))
    centroid0 = mean(dataset, axis=0).tolist()[0]   # start from a single initial cluster
    cenlist = [centroid0]                           # stores all centroids
    for j in range(m):
        clusterassment[j, 1] = distmeas(mat(centroid0), dataset[j, :]) ** 2
    while len(cenlist) < k:
        lowestsse = inf
        for i in range(len(cenlist)):
            # treat all points of cluster i as a small data set of their own
            ptsincurrcluster = dataset[nonzero(clusterassment[:, 0].A == i)[0], :]
            # split it into 2 clusters
            centroidmat, splitclustass = kmeans(ptsincurrcluster, 2, distmeas)
            ssesplit = sum(splitclustass[:, 1])   # SSE of the split cluster
            # SSE of the remaining data
            ssenotsplit = sum(clusterassment[nonzero(clusterassment[:, 0].A != i)[0], 1])
            if (ssesplit + ssenotsplit) < lowestsse:
                bestcenttosplit = i
                bestnewcents = centroidmat
                bestclusass = splitclustass.copy()
                lowestsse = ssesplit + ssenotsplit
        # rewrite the cluster assignments of the points in the cluster being split
        # (relabel cluster 1 first so the two assignments cannot collide)
        bestclusass[nonzero(bestclusass[:, 0].A == 1)[0], 0] = len(cenlist)      # cluster 1 becomes the newly added cluster
        bestclusass[nonzero(bestclusass[:, 0].A == 0)[0], 0] = bestcenttosplit   # cluster 0 keeps the index of the split cluster
        print('the bestcenttosplit is:', bestcenttosplit)
        print('the len of bestclustass is:', len(bestclusass))
        # update the centroid list: replace the split centroid and append the new one
        cenlist[bestcenttosplit] = bestnewcents[0, :].tolist()[0]
        cenlist.append(bestnewcents[1, :].tolist()[0])
        clusterassment[nonzero(clusterassment[:, 0].A == bestcenttosplit)[0], :] = bestclusass
    return mat(cenlist), clusterassment
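Both kmeans and bikeans take the distance function as the distmeas parameter, so a different measure can be plugged in without touching the clustering code itself. Below is a minimal sketch of such a plug-in, assuming a Manhattan-distance helper named distmanhattan (the name and the example call are illustrative, not part of the original kmean.py):

from numpy import *

# Hypothetical alternative distance measure: sum of absolute coordinate differences
def distmanhattan(veca, vecb):
    return sum(abs(veca - vecb))

# it would be passed the same way as the default disteclud, e.g.:
# centroids, clustassing = kmeans(datamat, 3, distmeas=distmanhattan)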

Test

>>> from kmean import *
>>> datamat = mat(loaddataset('testset2.txt'))
>>> centlist, myassments = bikeans(datamat, 3)
the bestcenttosplit is: 0
the len of bestclustass is: 60
the bestcenttosplit is: 0
the len of bestclustass is: 40
>>> centlist
matrix([[-2.94737575, 3.3263781 ],
        [-0.45965615, -2.7782156 ],
        [ 2.93386365, 3.12782785]])
>>> myassments
matrix([[ 2.00000000e+00, 1.45461050e-01],
        [ 0.00000000e+00, 6.80213825e-01],
        [ 1.00000000e+00, 1.02184582e+00],
        [ 2.00000000e+00, 1.34548760e+00],
        [ 0.00000000e+00, 1.35376464e+00],
        [ 1.00000000e+00, 3.87167519e+00],
        [ 2.00000000e+00, 8.37259951e-01],
        [ 0.00000000e+00, 2.20116272e-01],
        ......
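For comparison, the plain kmeans function can be called on the same matrix as a continuation of the session above; k=3 here is an assumption, and the resulting centroids vary from run to run because randcent picks the initial centroids at random.

>>> mycentroids, clustassing = kmeans(datamat, 3)
>>> mycentroids   # one centroid row per cluster; exact values differ between runs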
