Mahout Canopy原始碼分析

2021-06-22 22:43:30 字數 4846 閱讀 6167

1.選擇簡單、計算代價低的方法計算物件相似性,將相似的物件放在一個canopy子集下。通過計算得到若干canopy,canopy之間可以重疊,但不存在某個物件不屬於任何canopy的情況。

2.canopy演算法一般作為資料預處理。通過計算得到幾個簇,從每個簇中選取一個離中心點最近的點(中心點)作為其他聚類(k-means)的初始中心。

1.包括一個原始資料集list和一個空的canopy列表,以及2個閥值t1、t2(t2 < t1)。

2.從list中取一點p,將p從list中刪除,並在canopy列表中新建一個以p為中心的canopy。

3.用低成本的計算方式計算list中剩餘的點與該canopy的距離:如果距離小於t2,將點從list中刪除,並將點加入到canopy中;如果距離小於t1(但不小於t2),只將點加入到canopy中,不從list中刪除。

4.重複2-3步驟,直到list為空。

string t1_key = "org.apache.mahout.clustering.canopy.t1";  //t1

string t2_key = "org.apache.mahout.clustering.canopy.t2"; //t2

string t3_key = "org.apache.mahout.clustering.canopy.t3"; //t3,在reduce中用到的t1,未設定時預設等於t1

string t4_key = "org.apache.mahout.clustering.canopy.t4"; //t4,在reduce中用到的t2,未設定時預設等於t2

public static final string distance_measure_key = "org.apache.mahout.clustering.canopy.measure";//計算距離的公式

public static final string cf_key = "org.apache.mahout.clustering.canopy.canopyfilter";//一個閥值,canopy中的向量數量大於這個值才算是一個canopy

public canopy(vector center, int canopyid, distancemeasure measure);//建立一個中心為center,id為canopyid,距離計算公式為measure,只有一個給定點(center)的canopy

canopyclusterer(distancemeasure measure, double t1, double t2) ;//傳入計算距離的方法,t1,t2

public canopyclusterer(configuration config);//傳入一個configuration,可以在configuration物件中設定距離計算類,t1,t2,t3和t4等引數

public void configure(configuration configuration)

//根據點建立canopy集合

public static listcreatecanopies(listpoints, distancemeasure measure, double t1, double t2)

if (dist < t2)

} for (canopy c : canopies)

} return canopies;

} //將乙個點加入到canopy中

public void addpointtocanopies(vector point, collectioncanopies) to canopy: {}", abstractcluster.formatvector(point, null), canopy.getidentifier());

} canopy.observe(point);

} pointstronglybound = pointstronglybound || dist < t2;

} if (!pointstronglybound) at center:{}", nextcanopyid,

abstractcluster.formatvector(point, null));

} canopies.add(new canopy(point, nextcanopyid++, measure)); //新建乙個canopy加入到canopy列表

} }

public static void run(configuration conf, path input, path output, distancemeasure measure, double t1, double t2, double t3,  

double t4, int clusterfilter, boolean runclustering, double clusterclassificationthreshold, boolean runsequential)

throws ioexception, interruptedexception, classnotfoundexception

} public static path buildclusters(configuration conf, path input, path output, distancemeasure measure, double t1, double t2,

double t3, double t4, int clusterfilter, boolean runsequential) throws ioexception, interruptedexception, classnotfoundexception out: {} measure: {} t1: {} t2: {}", new object );

if (runsequential) else

} //單機聚類

private static path buildclustersseq(path input, path output, distancemeasure measure, double t1, double t2, int clusterfilter)

throws ioexception

path canopyoutputdir = new path(output, cluster.clusters_dir + '0' + cluster.final_iteration_suffix);

path path = new path(canopyoutputdir, "part-r-00000");

sequencefile.writer writer = new sequencefile.writer(fs, conf, path, text.class, clusterwritable.class);

clusterwritable clusterwritable = new clusterwritable();

try center:{} numpoints:{} radius:{}", new object );

}

if (canopy.getnumobservations() > clusterfilter)

} } finally

return canopyoutputdir;

} //分布式聚類

private static path buildclustersmr(configuration conf, path input, path output, distancemeasure measure, double t1, double t2,

double t3, double t4, int clusterfilter) throws ioexception, interruptedexception, classnotfoundexception

return canopyoutputdir;

}

//通過context物件構建一個canopycluster物件,並設定canopy閥值

protected void setup(context context) throws ioexception,interruptedexception

//將點加入到canopy中

protected void map(writablecomparable<?> key, vectorwritable point,context context) throws ioexception, interruptedexception

//更新canopy引數,將大於閥值的canopy輸出到reduce。map輸出(centroid,每個canopy的中心)

protected void cleanup(context context) throws ioexception,interruptedexception

}super.cleanup(context);

}

//更新t1,t2 

protected void setup(context context) throws ioexception,interruptedexception

//聚類canopy中心,得到最終的canopies。reduce輸出(canopy id,一個canopy)

protected void reduce(text arg0, iterablevalues,context context) throws ioexception, interruptedexception

for (canopy canopy : canopies)

}}

《原始碼閱讀》原始碼閱讀技巧,原始碼閱讀工具

檢視某個類的完整繼承關係 選中類的名稱,然後按f4 quick type hierarchy quick type hierarchy可以顯示出類的繼承結構,包括它的父類和子類 supertype hierarchy supertype hierarchy可以顯示出類的繼承和實現結構,包括它的父類和...

Cartographer原始碼篇 原始碼分析 1

在安裝編譯cartographer 1.0.0的時候,我們可以看到,主要包括cartographer ros、cartographer、ceres solver三個部分。其中,ceres solver用於非線性優化,求解最小二乘問題;cartographer ros為ros平台的封裝,獲取感測器資料...

python原始碼剖析 Python原始碼剖析

第頁共 頁python 原始碼剖析 物件機制 1.物件 在python的世界中,一切都是物件,一個整數是一個物件,一個字串也是一個物件,更為奇妙的是,型別也是一個物件,整數型別是一個物件,字串型別也是一個物件。從1989年guido在那個聖誕節揭開python世界的大幕開始,一直到現在,pyt...