1. Fetch the latest FS Image from the Active NameNode:
Look at the (NameNode directories) property in Ambari and copy the latest image to a node with free disk space and memory. (Ex: fsimage_0000000001138083674)
2. Load the FS Image:
On the node where you copied the FS Image. Run the below commands:
hive> load data inpath ‘/user/hdfs/lsr/lsrreport.txt’ overwrite into table lsr;
hive> create view lsr_view as select (case substr(permissions,1,1) when 'd' then 'dir' else 'file' end) as file_type,owner,cast(size as int) as size, fileaccessdate,time,file_path from lsr;
Query 1: Files < 1 MB (Top 100)
hive> select relative_size,fileaccessdate,file_path as total from (select (case size < 1048576 when true then 'small' else 'large' end) as relative_size,fileaccessdate,file_path from lsr_view where file_type='file') tmp where relative_size='small' limit 100;
Query 1: Files < 1 MB (Grouped by Path)
hive> select substr(file_path,1,45) ,count(*) from (select relative_size,fileaccessdate,file_path from (select (case size < 1048576 when true then 'small' else 'large' end) as relative_size,fileaccessdate,file_path from lsr_view where file_type='file') tmp where relative_size='small') tmp2 group by substr(file_path,1,45) order by 2 desc;
Query 1: Files < 1 KByte (Grouped by Owner)
hive> select owner ,count(1) from (select (case size < 1024 when true then 'small' else 'large' end) as relative_size,fileaccessdate,owner from lsr_view where file_type='file') tmp where relative_size='small' group by owner;
Query 1: Files < 1 KByte (Grouped by Date)
hive> select fileaccessdate ,count(1) from (select (case size < 1024 when true then 'small' else 'large' end) as relative_size,fileaccessdate,owner from lsr_view where file_type='file' ) tmp where relative_size='small' group by fileaccessdate;