<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: HDFS dir cleanup which older than 7 days in python script in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/HDFS-dir-cleanup-which-older-than-7-days-in-python-script/m-p/341996#M233672</link>
    <description>&lt;P&gt;Hi ,&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;The below source code removes files that are older than 3 days from the HDFS path&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt; #!/bin/sh   
 today=`date +'%s'`  
 hdfs dfs -ls /file/Path/ | while read line ; do  
 dir_date=$(echo ${line} | awk '{print $6}')  
 difference=$(( ( ${today} - $(date -d ${dir_date} +%s) ) / ( 24*60*60 ) ))  
 filePath=$(echo ${line} | awk '{print $8}')  
 if [ ${difference} -gt 3 ]; then  
   hdfs dfs -rm -r $filePath  
 fi  
 done &lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;hdfs dfs -rm -r command moves the data to the trash folder if the trash mechanism is configured.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;To ignore moving the file to trash folder use skipTrash option.&lt;/P&gt;</description>
    <pubDate>Fri, 22 Apr 2022 11:25:13 GMT</pubDate>
    <dc:creator>ggangadharan</dc:creator>
    <dc:date>2022-04-22T11:25:13Z</dc:date>
    <item>
      <title>HDFS dir cleanup which older than 7 days in python script</title>
      <link>https://community.cloudera.com/t5/Support-Questions/HDFS-dir-cleanup-which-older-than-7-days-in-python-script/m-p/341810#M233614</link>
      <description>&lt;P&gt;Hi Guyz,&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;please help me to build the python script for cleaning HDFS dir which are older than 3 days.&lt;/P&gt;&lt;P&gt;give me the suggestions&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 20 Apr 2022 12:21:04 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/HDFS-dir-cleanup-which-older-than-7-days-in-python-script/m-p/341810#M233614</guid>
      <dc:creator>arati</dc:creator>
      <dc:date>2022-04-20T12:21:04Z</dc:date>
    </item>
    <item>
      <title>Re: HDFS dir cleanup which older than 7 days in python script</title>
      <link>https://community.cloudera.com/t5/Support-Questions/HDFS-dir-cleanup-which-older-than-7-days-in-python-script/m-p/341996#M233672</link>
      <description>&lt;P&gt;Hi ,&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;The below source code removes files that are older than 3 days from the HDFS path&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt; #!/bin/sh   
 today=`date +'%s'`  
 hdfs dfs -ls /file/Path/ | while read line ; do  
 dir_date=$(echo ${line} | awk '{print $6}')  
 difference=$(( ( ${today} - $(date -d ${dir_date} +%s) ) / ( 24*60*60 ) ))  
 filePath=$(echo ${line} | awk '{print $8}')  
 if [ ${difference} -gt 3 ]; then  
   hdfs dfs -rm -r $filePath  
 fi  
 done &lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;hdfs dfs -rm -r command moves the data to the trash folder if the trash mechanism is configured.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;To ignore moving the file to trash folder use skipTrash option.&lt;/P&gt;</description>
      <pubDate>Fri, 22 Apr 2022 11:25:13 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/HDFS-dir-cleanup-which-older-than-7-days-in-python-script/m-p/341996#M233672</guid>
      <dc:creator>ggangadharan</dc:creator>
      <dc:date>2022-04-22T11:25:13Z</dc:date>
    </item>
    <item>
      <title>Re: HDFS dir cleanup which older than 7 days in python script</title>
      <link>https://community.cloudera.com/t5/Support-Questions/HDFS-dir-cleanup-which-older-than-7-days-in-python-script/m-p/342067#M233684</link>
      <description>Thanks for the solution, it will help a lot but I need help to build a python&lt;BR /&gt;script for this hdfs cleanup&lt;BR /&gt;please help with that also&lt;BR /&gt;</description>
      <pubDate>Sat, 23 Apr 2022 07:33:16 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/HDFS-dir-cleanup-which-older-than-7-days-in-python-script/m-p/342067#M233684</guid>
      <dc:creator>arati</dc:creator>
      <dc:date>2022-04-23T07:33:16Z</dc:date>
    </item>
    <item>
      <title>Re: HDFS dir cleanup which older than 7 days in python script</title>
      <link>https://community.cloudera.com/t5/Support-Questions/HDFS-dir-cleanup-which-older-than-7-days-in-python-script/m-p/342318#M233712</link>
      <description>&lt;P&gt;Hi,&lt;BR /&gt;&lt;BR /&gt;From shell find the files that needs to be deleted and save them in a temp file like below,&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt; #!/bin/sh   
 today=`date +'%s'`  
 hdfs dfs -ls /file/Path/ | while read line ; do  
 dir_date=$(echo ${line} | awk '{print $6}')  
 difference=$(( ( ${today} - $(date -d ${dir_date} +%s) ) / ( 24*60*60 ) ))  
 filePath=$(echo ${line} | awk '{print $8}')  
 if [ ${difference} -gt 3 ]; then  
   echo -e "$filePath" &amp;gt;&amp;gt; toDelete
 fi  
 done&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;Then&amp;nbsp;execute an arbitrary shell command using, for example, subprocess.call or the sh library, to do something like the below&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;import subprocess

file = open('toDelete', 'r')
for each in file:
	subprocess.call(["hadoop", "fs", "-rm", "-f", each])&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;Also, you can use hdfs fs API in PySpark like below,&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('abc').getOrCreate()

def delete_path(spark, path):
    sc = spark.sparkContext
    fs = (sc._jvm.org
          .apache.hadoop
          .fs.FileSystem
          .get(sc._jsc.hadoopConfiguration())
          )
    fs.delete(sc._jvm.org.apache.hadoop.fs.Path(path), True)

delete_path(spark, "Your/hdfs/path")&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 26 Apr 2022 06:15:31 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/HDFS-dir-cleanup-which-older-than-7-days-in-python-script/m-p/342318#M233712</guid>
      <dc:creator>ggangadharan</dc:creator>
      <dc:date>2022-04-26T06:15:31Z</dc:date>
    </item>
    <item>
      <title>Re: HDFS dir cleanup which older than 7 days in python script</title>
      <link>https://community.cloudera.com/t5/Support-Questions/HDFS-dir-cleanup-which-older-than-7-days-in-python-script/m-p/342696#M233796</link>
      <description>&lt;P&gt;Hi ggangadharan&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I followed the way you suggested, but now I need help with code so that we can execute that hdfs shell script from a python script.&lt;/P&gt;&lt;P&gt;and do it as an&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;import subprocess&amp;nbsp;&lt;/P&gt;&lt;P&gt;subprocess.call('./home/test.sh/' ,shell=True)&lt;/P&gt;&lt;P&gt;file = open('toDelete, 'r')&lt;/P&gt;&lt;P&gt;for each in file:&lt;BR /&gt;subprocess.call(["hadoop", "fs", "-rm", "-f", each])&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;but now my shell script is not executing and not showing any output, please suggest what I should do.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;thanks&lt;/P&gt;</description>
      <pubDate>Fri, 29 Apr 2022 10:32:54 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/HDFS-dir-cleanup-which-older-than-7-days-in-python-script/m-p/342696#M233796</guid>
      <dc:creator>arati</dc:creator>
      <dc:date>2022-04-29T10:32:54Z</dc:date>
    </item>
  </channel>
</rss>

