FileDistributionCalculator.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.tools.offlineImageViewer;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.RandomAccessFile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.BlockProto;
import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.SectionName;
import org.apache.hadoop.hdfs.server.namenode.FSImageUtil;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
import org.apache.hadoop.util.LimitInputStream;
import org.apache.hadoop.util.StringUtils;

import org.apache.hadoop.util.Preconditions;

/**
 * This is the tool for analyzing file sizes in the namespace image. In order to
 * run the tool one should define a range of integers <code>[0, maxSize]</code> by
 * specifying <code>maxSize</code> and a <code>step</code>. The range of integers is
 * divided into segments of size <code>step</code>:
 * <code>[0, s<sub>1</sub>, ..., s<sub>n-1</sub>, maxSize]</code>, and the visitor
 * calculates how many files in the system fall into each segment
 * <code>[s<sub>i-1</sub>, s<sub>i</sub>)</code>. Note that files larger than
 * <code>maxSize</code> always fall into the very last segment.
 *
 * <h3>Input.</h3>
 * <ul>
 * <li><code>filename</code> specifies the location of the image file;</li>
 * <li><code>maxSize</code> determines the range <code>[0, maxSize]</code> of files
 * sizes considered by the visitor;</li>
 * <li><code>step</code> the range is divided into segments of size step.</li>
 * </ul>
 *
 * <h3>Output.</h3> The output file is formatted as a tab separated two column
 * table: Size and NumFiles. Where Size represents the start of the segment, and
 * numFiles is the number of files form the image which size falls in this
 * segment.
 *
 */
final class FileDistributionCalculator {
  private final static long MAX_SIZE_DEFAULT = 0x2000000000L; // 1/8 TB = 2^37
  private final static int INTERVAL_DEFAULT = 0x200000; // 2 MB = 2^21
  private final static int MAX_INTERVALS = 0x8000000; // 128 M = 2^27

  private final Configuration conf;
  private final long maxSize;
  private final int steps;
  private final PrintStream out;

  private final int[] distribution;
  private int totalFiles;
  private int totalDirectories;
  private int totalBlocks;
  private long totalSpace;
  private long maxFileSize;

  private boolean formatOutput = false;

  FileDistributionCalculator(Configuration conf, long maxSize, int steps,
      boolean formatOutput, PrintStream out) {
    this.conf = conf;
    this.maxSize = maxSize == 0 ? MAX_SIZE_DEFAULT : maxSize;
    this.steps = steps == 0 ? INTERVAL_DEFAULT : steps;
    this.formatOutput = formatOutput;
    this.out = out;
    long numIntervals = this.maxSize / this.steps;
    // avoid OutOfMemoryError when allocating an array
    Preconditions.checkState(numIntervals <= MAX_INTERVALS,
        "Too many distribution intervals (maxSize/step): " + numIntervals +
        ", should be less than " + (MAX_INTERVALS+1) + ".");
    this.distribution = new int[1 + (int) (numIntervals)];
  }

  void visit(RandomAccessFile file) throws IOException {
    if (!FSImageUtil.checkFileFormat(file)) {
      throw new IOException("Unrecognized FSImage");
    }

    FileSummary summary = FSImageUtil.loadSummary(file);
    try (FileInputStream in = new FileInputStream(file.getFD())) {
      for (FileSummary.Section s : summary.getSectionsList()) {
        if (SectionName.fromString(s.getName()) != SectionName.INODE) {
          continue;
        }

        in.getChannel().position(s.getOffset());
        InputStream is = FSImageUtil.wrapInputStreamForCompression(conf,
            summary.getCodec(), new BufferedInputStream(new LimitInputStream(
                in, s.getLength())));
        run(is);
        output();
      }
    }
  }

  private void run(InputStream in) throws IOException {
    INodeSection s = INodeSection.parseDelimitedFrom(in);
    for (int i = 0; i < s.getNumInodes(); ++i) {
      INodeSection.INode p = INodeSection.INode.parseDelimitedFrom(in);
      if (p.getType() == INodeSection.INode.Type.FILE) {
        ++totalFiles;
        INodeSection.INodeFile f = p.getFile();
        totalBlocks += f.getBlocksCount();
        long fileSize = 0;
        for (BlockProto b : f.getBlocksList()) {
          fileSize += b.getNumBytes();
        }
        maxFileSize = Math.max(fileSize, maxFileSize);
        totalSpace += fileSize * f.getReplication();

        int bucket = fileSize > maxSize ? distribution.length - 1 : (int) Math
            .ceil((double)fileSize / steps);
        // Compare the bucket value with distribution's length again,
        // because sometimes the bucket value will be equal to
        // the length when maxSize can't be divided completely by step.
        if (bucket >= distribution.length) {
          bucket = distribution.length - 1;
        }
        ++distribution[bucket];

      } else if (p.getType() == INodeSection.INode.Type.DIRECTORY) {
        ++totalDirectories;
      }

      if (i % (1 << 20) == 0) {
        out.println("Processed " + i + " inodes.");
      }
    }
  }

  private void output() {
    // write the distribution into the output file
    out.print((formatOutput ? "Size Range" : "Size") + "\tNumFiles\n");
    for (int i = 0; i < distribution.length; i++) {
      if (distribution[i] != 0) {
        if (formatOutput) {
          out.print((i == 0 ? "[" : "(")
              + StringUtils.byteDesc(((long) (i == 0 ? 0 : i - 1) * steps))
              + ", "
              + StringUtils.byteDesc((long)
                  (i == distribution.length - 1 ? maxFileSize :
                      (long) i * steps)) + "]\t" + distribution[i]);
        } else {
          out.print(((long) i * steps) + "\t" + distribution[i]);
        }

        out.print('\n');
      }
    }
    out.print("totalFiles = " + totalFiles + "\n");
    out.print("totalDirectories = " + totalDirectories + "\n");
    out.print("totalBlocks = " + totalBlocks + "\n");
    out.print("totalSpace = " + totalSpace + "\n");
    out.print("maxFileSize = " + maxFileSize + "\n");
  }
}