隨機森林 邏輯回歸 貝葉斯等演算法的組合使用

2021-09-10 04:46:49 字數 4946 閱讀 6282

隨機森林+邏輯回歸+貝葉斯

1.gbdt的思想使其具有天然優勢可以發現多種有區分性的特徵以及特徵組合。使用其來自動發現有效的特徵、特徵組合,來作為lr模型中的特徵,以提高 ctr預估(click-through rate prediction)的準確性

2.這個程式主要是為了提升特徵的準確性,篩選有效特徵。其次是鍛鍊混合演算法的使用,避免單一演算法的侷限性。大家可以按照路子,隨意修改或者組合自己想要用的演算法,提升準確度。

3.normalizer() 正則化也是規範特徵的一種常用的方式

4.提升演算法的準確度有兩個出發點:

a、演算法本身的選擇、組合、優化。

b、資料的處理、特徵的處理、資料的準確性等

// NOTE(review): the article scraper lowercased every identifier; restored to the
// actual Spark ML / log4j class names. `org.apache.log4j.` was truncated in the
// source — assuming the usual wildcard import (Logger, Level) — TODO confirm.
import org.apache.log4j._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification._
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature._
import org.apache.spark.sql.SparkSession

// Load the raw text input and convert it to a DataFrame, then register it as a
// temp view for the SQL sampling step below.
val input_reco_theme = spark.read.textFile(inputdata01)

// NOTE(review): the row-parsing lambda body was lost when the article was
// scraped; reconstruct the field splitting against the original input format
// before running. The identity mapping below is only a syntactic placeholder.
val reco_theme = input_reco_theme.rdd.map(x => {
  // TODO(review): original parsing logic missing from source — split `x` into
  // the n dimension columns plus the label here.
  x
}).toDF()

reco_theme.show()

reco_theme.createOrReplaceTempView("input_data")

// Cap the sample at 10,000 rows per label value (row_number over a per-label
// window). BUG FIX: the original used `//` comments inside the SQL string
// (invalid SQL — use `--`/`/* */`) and elided the projected columns and the
// inner sub-query with Chinese placeholders; both reconstructed with markers.
// Also fixed the `result_valuse` typo (name is local to this step).
val result_values = spark.sql(
  """|select /* TODO(review): n dimension columns, label */ *
     |from (
     |    select *,
     |        row_number() over (partition by table1.label order by table1.rst_0) as rank
     |    from input_data table1
     |) table2
     |where table2.rank <= 10000
  """.stripMargin)

val ratings2 = result_values.toDF()

ratings2.show()

// StringIndexer maps each categorical string column to a numeric index column
// (xaid/brand/model -> *index), batched in a single Pipeline.
val converter_pipeline = new Pipeline().setStages(Array(
  new StringIndexer()
    .setInputCol("xaid").setOutputCol("xaidindex"),
  new StringIndexer()
    .setInputCol("brand").setOutputCol("brandindex"),
  new StringIndexer()
    .setInputCol("model").setOutputCol("modelindex")
))

// BUG FIX: this def was fused onto the `))` line in the scraped source;
// also corrected the "pipline" misspelling (only referenced below).
def getConverterPipeline: Pipeline = converter_pipeline

val cluster_info_split_table = getConverterPipeline.fit(ratings2).transform(ratings2)

cluster_info_split_table.show()

// NOTE(review): the column list was elided in the article ("//n個維度");
// selecting the indexed columns plus the label — confirm against the schema.
val df1 = cluster_info_split_table.select("xaidindex", "brandindex", "modelindex", "label")

df1.show()

// Pack the dimension columns into a single "features" vector column
// (the features/label layout Spark ML estimators expect).
// NOTE(review): the input column list was elided in the article ("//n個維度");
// using the indexed feature columns from the StringIndexer step — confirm.
val assembler: VectorAssembler = new VectorAssembler()
  .setInputCols(Array("brandindex", "modelindex"))
  .setOutputCol("features")

val df2 = assembler.transform(df1).select("xaidindex", "label", "features")

df2.limit(10).rdd.foreach(println)

// L1-normalize each feature vector (p = 1.0): scales every row so its
// components sum (in absolute value) to 1.
val normalizer = new Normalizer().setInputCol("features").setOutputCol("normfeatures").setP(1.0)

val l1normdata = normalizer.transform(df2)

l1normdata.show()

// Split the data into training and test sets (30% held out for testing).
val Array(trainingdata, testdata) = l1normdata.randomSplit(Array(0.7, 0.3))

// Train a random-forest model on the normalized features.
// NOTE(review): the original comment said "gbt model" but the code builds a
// RandomForestClassifier — the article's point 1 discusses GBDT+LR, yet the
// implementation uses RF feature importances instead.
val rf = new RandomForestClassifier()
  .setLabelCol("label")
  .setFeaturesCol("normfeatures")
  .setNumTrees(9)

// Fit on the training split, then score the held-out test split.
val model = rf.fit(trainingdata)

val predictions = model.transform(testdata)

predictions.show()

// Select (prediction, true label) and compute test error.
val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")

val accuracy = evaluator.evaluate(predictions)

// BUG FIX: the original interpolation was empty (s"test error = $");
// print the actual error rate.
println(s"test error = ${1.0 - accuracy}")

// Feature selection: keep only the dimensions the random forest assigned a
// non-zero importance, and slice them out of the original feature vector.
val s = model.featureImportances.toArray.toSeq

// Indices of features with non-zero importance (was a `var` + for/yield;
// `val` + indices.filter is equivalent and immutable).
val result_for = s.indices.filter(i => s(i) != 0)

val slicer = new VectorSlicer().setInputCol("features").setOutputCol("features_select")
  .setIndices(result_for.toArray)

val trainingdata_new = slicer.transform(trainingdata)

trainingdata_new.printSchema()

trainingdata_new.show()

val test_new = slicer.transform(testdata)

// L1 (lasso) logistic regression on the RF-selected features.
// NOTE(review): lambda = 0 disables regularization even though
// elasticNetParam = 1 selects pure L1 — confirm this is intentional.
// Fixed the "labmda" typo and replaced `var`s with `val`s (never reassigned).
val lambda = 0.0

val gbgt_lr = new LogisticRegression()
  .setFeaturesCol("features_select")
  .setLabelCol("label")
  .setMaxIter(100)
  .setStandardization(false)
  .setRegParam(lambda)
  .setElasticNetParam(1)

val modellr_gbgt = gbgt_lr.fit(trainingdata_new)

val trainpredictionsd = modellr_gbgt.transform(trainingdata_new)

trainpredictionsd.show()

// Per-iteration objective values from training (loss history).
val trainingsummary = modellr_gbgt.summary

val objectivehistory = trainingsummary.objectiveHistory

println("objectivehistory:")

objectivehistory.foreach(println)

// Training-set accuracy of the logistic-regression model.
val accuracy2 = trainingsummary.accuracy

println(s"accuracy: $accuracy2")

// Naive Bayes on the same RF-selected features.
// NOTE(review): the class name was censored to "*****bayes" by the blog
// platform; NaiveBayes is the only matching Spark ML classifier — confirm.
val model2 = new NaiveBayes()
  .setFeaturesCol("features_select")
  .setLabelCol("label")
  .setSmoothing(0.01)
  .fit(trainingdata_new)

// Select example rows to display.
val predictions2 = model2.transform(test_new)

predictions2.show()

// Select (prediction, true label) and compute test accuracy for the
// naive-Bayes model.
val evaluator2 = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")

// BUG FIX: the original evaluated `predictions` (the random-forest output)
// here, silently re-reporting the RF score; this evaluator is for the
// naive-Bayes model, so evaluate `predictions2`.
val accuracy3 = evaluator2.evaluate(predictions2)

println(s"test set accuracy = $accuracy3")}}

大資料、資料分析、爬蟲群: 《453908562》

貝葉斯演算法的應用

聚類分析實戰 手寫體數字的識別 coding utf 8 time 2019 10 23 13 25 author hxf email 1870212598 qq.com file bayes use.py description 貝葉斯演算法的應用 貝葉斯演算法的應用 from numpy imp...

邏輯斯蒂回歸VS決策樹VS隨機森林

lr 與svm 不同 1.logistic regression適合需要得到一個分類概率的場景,svm則沒有分類概率 2.lr其實同樣可以使用kernel,但是lr沒有support vector在計算複雜度上會高出很多。如果樣本量很大並且需要的是一個複雜模型,那麼建議svm 3.如果樣本比較少,模...

ML 高斯判別分析 樸素貝葉斯和邏輯回歸

華電北風吹 天津大學認知計算與應用重點實驗室 最後修改日期 2015 8 22 近來看 中經常看到gda和樸素貝葉斯,並且 中說的演算法中用到的貝葉斯公式,對怎麼用的原理以前沒有仔細研究,今天仔細的看了斯坦福機器學習的關於gda,nb和lr的講義部分。理解了貝葉斯公式在gda和nb中的原理,以及gd...