HIVE專案實戰

2021-09-27 07:59:42 字數 4278 閱讀 3731

字段

備註詳細描述

video id

11位字串

uploader

age

category

length

views

觀看次數

rate

滿分5分

ratings

流量

comments

related ids

2.使用者表

表6-14 使用者表

字段備註

字段型別

uploader

上傳者使用者名稱

string

videos

int

friends

朋友數量

int

public string datarinse

(string str)

//將使用者中的空格替換掉

split[3]

= split[3]

.replaceall

(" ",""

);stringbuilder stringbuilder =

newstringbuilder()

;for

(int i =

0; i < split.length; i++

)else

}else

else}}

return stringbuilder.

tostring()

;}public

static

void

main

(string[

] args)

public

class

extends

}

public

class

etldriver

implements

tool

public

void

setconf

(configuration conf)

public configuration getconf()

public

static

void

main

(string[

] args)

throws exception

}

-- Raw video table, loaded from tab-separated text files.
-- category and relatedid each hold several values per row, separated by "&".
-- Hive requires an element type on array columns; string matches how the
-- queries use them (relatedid elements are compared against videoid).
create table gulivideo_ori(
    videoid   string,
    uploader  string,
    age       int,
    category  array<string>,
    length    int,
    views     int,
    rate      float,
    ratings   int,
    comments  int,
    relatedid array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;

-- Raw uploader table, loaded from tab-separated text files:
-- one row per uploader with their video count and friend count.
create table gulivideo_user_ori(
    uploader string,
    videos   int,
    friends  int
)
row format delimited
    fields terminated by "\t"
stored as textfile;

-- Ten most-viewed videos: uploader and view count, highest first.
select
    uploader,
    views
from gulivideo_ori
order by views desc
limit 10;

-- Top 10 video categories by number of videos.
select
    t3.cate,
    t3.cou_cate
from (
    -- Step 2: count the videos in each category.
    select
        t2.cate,
        count(*) as cou_cate
    from (
        -- Step 1: explode the category array into one row per (video, category).
        select t1.ca as cate
        from gulivideo_ori
        lateral view explode(category) t1 as ca
    ) t2
    group by t2.cate
) t3
-- Step 3: keep the ten largest counts; descending so the hottest come first.
order by t3.cou_cate desc
limit 10;

-- Distinct categories found among the 20 most-viewed (video, category) rows.
-- NOTE(review): the top-20 cut is applied after the explode, so a video with
-- several categories occupies several of the 20 rows. Confirm whether the
-- intent is "top 20 videos" (limit first, then explode) instead.
select distinct
    t3.cate
from (
    -- Step 2: keep the 20 rows with the highest view counts.
    select
        t2.cate,
        t2.views,
        t2.videoid
    from (
        -- Step 1: explode the category array into one row per (video, category).
        select
            t1.ca as cate,
            videoid,
            views
        from gulivideo_ori
        lateral view explode(category) t1 as ca
    ) t2
    order by views desc
    limit 20
) t3;

-- Rank the categories of the videos related to the 50 most-viewed videos.
select
    t5.category,
    t5.hot
from (
    -- Step 4: explode each related video's categories and count per category.
    -- Grouping is on the exploded category_name, not on the array column.
    select
        category_name as category,
        count(*)      as hot
    from (
        -- Step 3: join back to the video table to fetch each related video's categories.
        select t3.category
        from (
            -- Step 2: explode the related-id arrays and de-duplicate the ids.
            select distinct relatedids_name
            from (
                -- Step 1: the 50 most-viewed videos.
                select relatedid
                from gulivideo_ori
                order by views desc
                limit 50
            ) t1
            lateral view explode(t1.relatedid) relatedids_t as relatedids_name
        ) t2
        inner join gulivideo_ori t3
            on t2.relatedids_name = t3.videoid
    ) t4
    lateral view explode(t4.category) category_t as category_name
    group by category_name
) t5
-- Step 5: hottest categories first.
order by t5.hot desc;

-- Top 10 rows by view count within each category.
-- NOTE(review): gulivideo_category is not defined in this section; this
-- presumably holds one row per (video, categoryid) -- confirm its schema.
select
    t1.categoryid,
    t1.views
from (
    -- Number the rows of each category from most to least viewed.
    -- The alias is rk rather than rank to avoid shadowing the built-in rank().
    select
        categoryid,
        views,
        row_number() over (partition by categoryid order by views desc) as rk
    from gulivideo_category
) t1
where t1.rk <= 10;

-- Top 10 rows by ratings within each category.
-- NOTE(review): gulivideo_category is not defined in this section; this
-- presumably holds one row per (video, categoryid) -- confirm its schema.
select
    t1.categoryid,
    t1.ratings
from (
    -- Number the rows of each category from highest to lowest ratings.
    -- The alias is rk rather than rank to avoid shadowing the built-in rank().
    select
        categoryid,
        ratings,
        row_number() over (partition by categoryid order by ratings desc) as rk
    from gulivideo_category
) t1
where t1.rk <= 10;

-- The 20 most-viewed videos among those uploaded by the 20 users
-- who have uploaded the most videos.
select
    t2.uploader,
    t2.views
from (
    -- The 20 uploaders with the highest video counts.
    select uploader
    from gulivideo_user_ori
    order by videos desc
    limit 20
) t1
inner join gulivideo_ori t2
    on t1.uploader = t2.uploader
order by views desc
limit 20;

-- Top 10 rows by view count within each category.
-- NOTE(review): gulivideo_category is not defined in this section; this
-- presumably holds one row per (video, categoryid) -- confirm its schema.
select
    t1.categoryid,
    t1.views
from (
    -- Number the rows of each category from most to least viewed.
    -- The alias is rk rather than rank to avoid shadowing the built-in rank().
    select
        categoryid,
        views,
        row_number() over (partition by categoryid order by views desc) as rk
    from gulivideo_category
) t1
where t1.rk <= 10;

Hive專案實戰三

這裡總共需要建立4張表,明明只有兩個資料檔案,為什麼要建立4張表呢?因為這裡建立的表要使用orc的壓縮方式,而不使用預設的textfile的方式;要向orc壓縮格式的表匯入資料,需要使用子查詢的方式,即把從另一張表中查詢到的資料插入orc壓縮格式的表中,所以這裡需要四張表,兩張textfil...

hive 專案實戰 2

建表 建立表:這裡總共需要建立4張表,明明只有兩個資料檔案,為什麼要建立4張表呢?因為這裡建立的表要使用orc的壓縮方式,而不使用預設的textfile的方式;要向orc壓縮格式的表匯入資料,需要使用子查詢的方式,即把從另一張表中查詢到的資料插入orc壓縮格式的表中,所以這裡需要四張表,兩張t...

Hive專案實戰一

1.需求描述 2.資料來源結構說明 資料來源1 user.txt 資料樣例 資料樣例中的三個字段結構 上傳者使用者名稱 string int朋友數量 int資料來源2 video.txt 資料樣例 fqshwyqgqsw lonelygirl15736 people blogs133 151763 ...