<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>question Re: How can one change block size for large existing HDFS data set? in Support Questions</title>
    <link>https://community.cloudera.com/t5/Support-Questions/How-can-one-change-block-size-for-large-existing-HDFS-data/m-p/97339#M60557</link>
    <description>&lt;P&gt;So a truly in-place conversion is impossible, but it sounds like, if the data were partitioned, one could run distcp on one partition at a time, deleting each original partition after it is copied (sketched below). Thanks man.&lt;/P&gt;</description>
    <pubDate>Fri, 20 Nov 2015 09:07:00 GMT</pubDate>
    <dc:creator>pcoates</dc:creator>
    <dc:date>2015-11-20T09:07:00Z</dc:date>
    <item>
      <title>How can one change block size for large existing HDFS data set?</title>
      <link>https://community.cloudera.com/t5/Support-Questions/How-can-one-change-block-size-for-large-existing-HDFS-data/m-p/97337#M60555</link>
      <description>&lt;P&gt;I have hundreds of thousands of small data blocks (&amp;lt; 64 MB) that I'd like to turn into a more manageable number of larger blocks, say, 128 MB or 256 MB. This is CSV data. How can I do this with a distributed job, and can it be done "in place", i.e., without temporarily doubling the space requirement?&lt;/P&gt;</description>
      <pubDate>Fri, 20 Nov 2015 05:40:37 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/How-can-one-change-block-size-for-large-existing-HDFS-data/m-p/97337#M60555</guid>
      <dc:creator>pcoates</dc:creator>
      <dc:date>2015-11-20T05:40:37Z</dc:date>
    </item>
    <item>
      <title>Re: How can one change block size for large existing HDFS data set?</title>
      <link>https://community.cloudera.com/t5/Support-Questions/How-can-one-change-block-size-for-large-existing-HDFS-data/m-p/97338#M60556</link>
      <description>&lt;P&gt;There is no effective way to change the block size "in place". The block size is tightly tied to the on-disk layout of block files at the DataNodes, so changing it for existing data is non-trivial.&lt;/P&gt;&lt;P&gt;As for running a distributed job to do this, you can use distcp with an override of the block size on the command line; the example below sets dfs.blocksize to 268435456 (256 MB). This does, however, cause a temporary doubling of the storage consumed.&lt;/P&gt;&lt;PRE&gt;&amp;gt; hadoop distcp -D dfs.blocksize=268435456 /input /output
&amp;gt; hdfs dfs -stat 'name=%n blocksize=%o' /input/hello
name=hello blocksize=134217728
&amp;gt; hdfs dfs -stat 'name=%n blocksize=%o' /output/hello
name=hello blocksize=268435456&lt;/PRE&gt;
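&lt;P&gt;Since the copy temporarily doubles the space used by the data set, it is worth confirming the cluster has the headroom before starting. One way to sanity-check (the /input path is just an example):&lt;/P&gt;&lt;PRE&gt;&amp;gt; hdfs dfs -du -s -h /input   # space the extra copy will need
&amp;gt; hdfs dfsadmin -report       # shows DFS remaining across the cluster&lt;/PRE&gt;</description>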
      <pubDate>Fri, 20 Nov 2015 06:19:22 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/How-can-one-change-block-size-for-large-existing-HDFS-data/m-p/97338#M60556</guid>
      <dc:creator>cnauroth</dc:creator>
      <dc:date>2015-11-20T06:19:22Z</dc:date>
    </item>
    <item>
      <title>Re: How can one change block size for large existing HDFS data set?</title>
      <link>https://community.cloudera.com/t5/Support-Questions/How-can-one-change-block-size-for-large-existing-HDFS-data/m-p/97339#M60557</link>
      <description>&lt;P&gt;So a truly in-place conversion is impossible, but it sounds like, if the data were partitioned, one could run distcp on one partition at a time, deleting each original partition after it is copied (sketched below). Thanks man.&lt;/P&gt;
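&lt;P&gt;Rough sketch of that loop, assuming date-named partition directories under /data; the paths, partition names, and the 256 MB target are placeholders, not anything from this cluster:&lt;/P&gt;&lt;PRE&gt;# One partition at a time: recopy with the larger block size, then delete
# the original only after the copy succeeds, so at most one partition is
# ever stored twice.
SRC=/data
DST=/data_256m
for name in 2015-11-18 2015-11-19 2015-11-20; do
  if hadoop distcp -D dfs.blocksize=268435456 "$SRC/$name" "$DST/$name"; then
    hdfs dfs -rm -r "$SRC/$name"
  fi
done&lt;/PRE&gt;</description>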
      <pubDate>Fri, 20 Nov 2015 09:07:00 GMT</pubDate>
      <guid>https://community.cloudera.com/t5/Support-Questions/How-can-one-change-block-size-for-large-existing-HDFS-data/m-p/97339#M60557</guid>
      <dc:creator>pcoates</dc:creator>
      <dc:date>2015-11-20T09:07:00Z</dc:date>
    </item>
  </channel>
</rss>

