hdfs檔案統計

2021-10-03 05:20:56 字數 3882 閱讀 5077

以下操作基於hadoop3.1.2、hive3.1.2,其他版本欄位有差異

1、通過hdfs命令匯出csv格式檔案

hdfs dfsadmin -fetchimage myfile

hdfs oiv -i myfile -o fsimage.csv -p delimited

2、到hive建外表及統計

create external table

default

.hdfs_info(

path string,

replication string,

modificationtime string,

accesstime string,

preferredblocksize string,

blockscount string,

filesize string,

nsquota string,

dsquota string,

permission string,

username string,

groupname string)

row format delimited fields

terminated

by'\t'

location

'hdfs://qingfeng_cluster:8020/tmp/prod_hdfs'

;

統計一級目錄大小

select joinedpath, sumsize

from

(select joinedpath,

round

(sum

(filesize)

/1024

/1024

/1024,2

)as sumsize

from

(select concat(

'/',split(path,

'\/')[

1])as joinedpath,accesstime,filesize,username

from

default

.hdfs_info

)tgroup

by joinedpath

)horder

by sumsize desc

;

統計二級目錄大小

select joinedpath, sumsize

from

(select joinedpath,

round

(sum

(filesize)

/1024

/1024

/1024,2

)as sumsize

from

(select concat(

'/',split(path,

'\/')[

1],'/'

,split(path,

'\/')[

2])as joinedpath,accesstime,filesize,username

from

default

.hdfs_info

)tgroup

by joinedpath

)horder

by sumsize desc

;

3、統計目錄下小於100k的檔案數量

select concat(

'/',split(path,

'\/')[

1],'/'

,split(path,

'\/')[

2],'/'

,split(path,

'\/')[

3])as path ,

count(*

)as small_file_num

from

(select relative_size,path

from

(select

(case filesize <

100*

1024

when

true

then

'small'

else

'large'

end)

as relative_size, path

from

default

.hdfs_info) tmp

where

relative_size=

'small'

) tmp2

group

by concat(

'/',split(path,

'\/')[

1],'/'

,split(path,

'\/')[

2],'/'

,split(path,

'\/')[

3])order

by small_file_num desc

;

其他各級目錄小檔案統計

select joinedpath,

from_unixtime(ceil(accesstime/

1000),

'yyyy-mm-dd hh:mm:ss'

)as accesstime,

from_unixtime(ceil(modificatetime/

1000),

'yyyy-mm-dd hh:mm:ss'

)as modificatetime,

sumsize

from

(select joinedpath,

min(accesstime)

as accesstime,

max(modificatetime)

as modificatetime,

round

(sum

(filesize)

/1024

/1024

/1024,2

)as sumsize

from

(select concat(

'/',split(path,

'\/')[

1],'/'

,split(path,

'\/')[

2],'/'

,split(path,

'\/')[

3],'/'

,split(path,

'\/')[

4],'/'

,split(path,

'\/')[

5])as joinedpath,

accesstime,

modificatetime,

filesize,

username

from

default

.hdfs_info

where concat(

'/',split(path,

'\/')[

1],'/'

,split(path,

'\/')[

2],'/'

,split(path,

'\/')[

3],'/'

,split(path,

'\/')[

4])=

'/user/hive/warehouse/default.db'

)t where joinedpath !=

'null'

group

by joinedpath)h

order

by sumsize desc

;

合併 hdfs 檔案

待研究,只做儲存。將hdfs中不同目錄下面的資料合在一起,並存放在指定的目錄中,示例如 `sqoop merge --new-data test/p1/person --onto test/p2/person --target-dir test/merged --jar-file /opt/data/sqoop/per...`(命令選項依擷取內容推測補回 `--` 前綴)

HDFS 檔案許可權

`hadoop fs -ls /countout` 輸出示例:`-rw-r--r-- 3 root supergroup 1311 2018-06-20 00:11 /countout/part-r-00000` 檔案訪問許可權 ...(原文擷取時遺失 `-` 字元,此處依常見輸出格式推測還原)

hdfs檔案上傳

登入hdfs的web埠查詢檔案路徑,預設埠為50070。`hadoop fs -ls /`、`hadoop fs -mkdir /tmp/data` 1.終端輸入 `vim test1.txt` 鍵入內容後以 :wq 儲存 2.將伺服器上路徑為 test1.txt 的檔案傳輸到hdfs中的 /tmp/data 下:`hadoop fs ...`(命令中的 `-` 依擷取內容推測補回)