2023年寒假學習進度第十天

2022-08-23 01:30:18 字數 4708 閱讀 6224

今天主要學習了實驗 7 spark 機器學習庫 mllib 程式設計實踐,

主要程式碼:

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, PCA, PCAModel, StringIndexer, VectorIndexer}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions
import spark.implicits._

// Experiment part 1: reduce six numeric columns to 3 principal components, then train and
// evaluate a logistic-regression pipeline on the projected features.
// Assumes a spark-shell session: `sc` and `spark` are used here but never defined in this file.

/** One parsed record: the selected numeric columns as a dense vector plus the raw class label. */
case class Adult(features: org.apache.spark.ml.linalg.Vector, label: String)

/**
 * Reads a comma-separated file and builds a DataFrame of [[Adult]] rows.
 *
 * Columns 0, 2, 4, 10, 11 and 12 become the feature vector; column 14 becomes the label.
 * Lines with fewer than 15 fields (blank lines, non-CSV header lines) are skipped instead of
 * failing inside the executor.
 *
 * NOTE(review): the label is trimmed and a trailing "." is removed so that train and test labels
 * compare equal. Presumably the test file carries the UCI-style "." suffix, which would surface
 * as a "failed to execute user defined function" error in StringIndexer — confirm against your copy.
 *
 * @param path path of the text file to load
 * @return a DataFrame with columns `features` and `label`
 */
def loadAdult(path: String): DataFrame =
  sc.textFile(path)
    .map(_.split(","))
    .filter(_.length >= 15)
    .map { p =>
      val numeric = Array(0, 2, 4, 10, 11, 12).map(i => p(i).trim.toDouble)
      Adult(Vectors.dense(numeric), p(14).trim.stripSuffix("."))
    }
    .toDF()

val df = loadAdult("adult.data.txt")

val test = loadAdult("adult.test.txt")

// Fit PCA on the training data only and apply the same projection to both sets.
val pca = new PCA().setInputCol("features").setOutputCol("pcafeatures").setK(3).fit(df)

val result = pca.transform(df)

val testdata = pca.transform(test)

result.show(false)

testdata.show(false)

val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedlabel").fit(result)

labelIndexer.labels.foreach(println)

val featureIndexer = new VectorIndexer().setInputCol("pcafeatures").setOutputCol("indexedfeatures").fit(result)

println(featureIndexer.numFeatures)

// Maps the numeric prediction back to the original label strings.
val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedlabel").setLabels(labelIndexer.labels)

val lr = new LogisticRegression().setLabelCol("indexedlabel").setFeaturesCol("indexedfeatures").setMaxIter(100)

val lrPipeline = new Pipeline().setStages(Array(labelIndexer, featureIndexer, lr, labelConverter))

val lrPipelineModel = lrPipeline.fit(result)

// Stage index 2 is the logistic regression (after labelIndexer and featureIndexer).
val lrModel = lrPipelineModel.stages(2).asInstanceOf[LogisticRegressionModel]

println(s"coefficients: ${lrModel.coefficientMatrix}intercept: ${lrModel.interceptVector}numclasses: ${lrModel.numClasses}numfeatures: ${lrModel.numFeatures}")

val lrPredictions = lrPipelineModel.transform(testdata)

// The evaluator's default metric is F1; request accuracy explicitly so that
// "1.0 - lrAccuracy" really is the test error.
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedlabel").setPredictionCol("prediction").setMetricName("accuracy")

val lrAccuracy = evaluator.evaluate(lrPredictions)

println("test error = " + (1.0 - lrAccuracy))

// Experiment part 2: put PCA inside the pipeline and tune its dimensionality together with the
// logistic-regression regularisation through 3-fold cross-validation.
// Re-declares names from part 1, which only works in a REPL session (spark-shell);
// `df` and `test` are the DataFrames built in part 1.

// Unfitted PCA estimator: k is supplied by the parameter grid below.
val pca = new PCA().setInputCol("features").setOutputCol("pcafeatures")

val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("indexedlabel").fit(df)

val featureIndexer = new VectorIndexer().setInputCol("pcafeatures").setOutputCol("indexedfeatures")

val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedlabel").setLabels(labelIndexer.labels)

val lr = new LogisticRegression().setLabelCol("indexedlabel").setFeaturesCol("indexedfeatures").setMaxIter(100)

val lrPipeline = new Pipeline().setStages(Array(pca, labelIndexer, featureIndexer, lr, labelConverter))

// 6 values of k x 2 elastic-net mixes x 3 regularisation strengths = 36 candidate settings.
val paramGrid = new ParamGridBuilder()
  .addGrid(pca.k, Array(1, 2, 3, 4, 5, 6))
  .addGrid(lr.elasticNetParam, Array(0.2, 0.8))
  .addGrid(lr.regParam, Array(0.01, 0.1, 0.5))
  .build()

// The evaluator's default metric is F1; request accuracy explicitly because the result is
// reported as accuracy below. The same evaluator drives model selection and the final score.
val evaluator = new MulticlassClassificationEvaluator().setLabelCol("indexedlabel").setPredictionCol("prediction").setMetricName("accuracy")

val cv = new CrossValidator()
  .setEstimator(lrPipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(3)

val cvModel = cv.fit(df)

val lrPredictions = cvModel.transform(test)

val lrAccuracy = evaluator.evaluate(lrPredictions)

println("準確率為" + lrAccuracy)

val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel]

// Stage order matches lrPipeline: 0 = PCA, 1 = labelIndexer, 2 = featureIndexer, 3 = logistic regression.
val lrModel = bestModel.stages(3).asInstanceOf[LogisticRegressionModel]

println(s"coefficients: ${lrModel.coefficientMatrix}intercept: ${lrModel.interceptVector}numclasses: ${lrModel.numClasses}numfeatures: ${lrModel.numFeatures}")

val pcaModel = bestModel.stages(0).asInstanceOf[PCAModel]

println("primary component: " + pcaModel.pc)

在繼續這個實驗時遇到一個問題,現在還沒解決,如圖:

經過查詢,這個問題的原因是無法執行使用者定義的函式(Failed to execute user defined function),但是我完全按照教程中的程式碼進行就會產生這個問題,網上沒有這個問題的解析,所以還未解決。

學習第十天

一 介面 jdk1.8及之後新增了2種可以定義存在方法體的方法 預設方法 default關鍵字修飾的方法 使用 通過實現類物件使用 靜態方法 使用 通過介面名去呼叫 二 單例模式 保證類只能存在一個例項 餓漢式 先建立物件,然後需要的人要這個物件,保證永遠使用的都是這個建立好的物件 執行緒安全的,效...

python學習第十天

class student count 0 def init self,name,age,address self.name name self.age age self.address address student.count 1 k print k w open a.txt w encodin...

菜鳥學習第十天

1.字串最大的特點 一旦初始化就不可以改變。不可改變的是字串內容而不是指向字串的引用 2.string s abc 其中s是一個類型別變數,abc 是一個物件。3.string s1 abc 和string s2 new string abc s1 s2 和s1.equals s2 比較的是他們在記憶...