FileDistributionVisitor.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.tools.offlineImageViewer;

import java.io.IOException;
import java.util.LinkedList;

import org.apache.hadoop.util.StringUtils;

/**
 * File size distribution visitor.
 * 
 * <h3>Description.</h3>
 * This is the tool for analyzing file sizes in the namespace image.
 * In order to run the tool one should define a range of integers
 * <code>[0, maxSize]</code> by specifying <code>maxSize</code> and a <code>step</code>.
 * The range of integers is divided into segments of size <code>step</code>: 
 * <code>[0, s<sub>1</sub>, ..., s<sub>n-1</sub>, maxSize]</code>,
 * and the visitor calculates how many files in the system fall into 
 * each segment <code>[s<sub>i-1</sub>, s<sub>i</sub>)</code>. 
 * Note that files larger than <code>maxSize</code> always fall into 
 * the very last segment.
 * 
 * <h3>Input.</h3>
 * <ul>
 * <li><code>filename</code> specifies the location of the image file;</li>
 * <li><code>maxSize</code> determines the range <code>[0, maxSize]</code> of files
 * sizes considered by the visitor;</li>
 * <li><code>step</code> the range is divided into segments of size step.</li>
 * </ul>
 *
 * <h3>Output.</h3>
 * The output file is formatted as a tab separated two column table:
 * Size and NumFiles. Where Size represents the start of the segment,
 * and numFiles is the number of files form the image which size falls in 
 * this segment.
 */
class FileDistributionVisitor extends TextWriterImageVisitor {
  final private LinkedList<ImageElement> elemS = new LinkedList<ImageElement>();

  private final static long MAX_SIZE_DEFAULT = 0x2000000000L;   // 1/8 TB = 2^37
  private final static int INTERVAL_DEFAULT = 0x200000;         // 2 MB = 2^21

  private int[] distribution;
  private long maxSize;
  private int step;

  private int totalFiles;
  private int totalDirectories;
  private int totalBlocks;
  private long totalSpace;
  private long maxFileSize;

  private FileContext current;

  private boolean inInode = false;
  private boolean formatOutput = false;

  /**
   * File or directory information.
   */
  private static class FileContext {
    String path;
    long fileSize;
    int numBlocks;
    int replication;
  }

  public FileDistributionVisitor(String filename, long maxSize, int step,
      boolean formatOutput) throws IOException {
    super(filename, false);
    this.maxSize = (maxSize == 0 ? MAX_SIZE_DEFAULT : maxSize);
    this.step = (step == 0 ? INTERVAL_DEFAULT : step);
    this.formatOutput = formatOutput;
    long numIntervals = this.maxSize / this.step;
    if(numIntervals >= Integer.MAX_VALUE)
      throw new IOException("Too many distribution intervals " + numIntervals);
    this.distribution = new int[1 + (int)(numIntervals)];
    this.totalFiles = 0;
    this.totalDirectories = 0;
    this.totalBlocks = 0;
    this.totalSpace = 0;
    this.maxFileSize = 0;
  }

  @Override
  void start() throws IOException {}

  @Override
  void finish() throws IOException {
    output();
    super.finish();
  }

  @Override
  void finishAbnormally() throws IOException {
    System.out.println("*** Image processing finished abnormally.  Ending ***");
    output();
    super.finishAbnormally();
  }

  private void output() throws IOException {
    // write the distribution into the output file
    write((formatOutput ? "Size Range" : "Size") + "\tNumFiles\n");
    for (int i = 0; i < distribution.length; i++) {
      if (distribution[i] > 0) {
        if (formatOutput) {
          write((i == 0 ? "[" : "(")
              + StringUtils.byteDesc(((long) (i == 0 ? 0 : i - 1) * step))
              + ", "
              + StringUtils.byteDesc((long)
                  (i == distribution.length - 1 ? maxFileSize : i * step))
                  + "]\t"
              + distribution[i] + "\n");
        } else {
          write(((long) i * step) + "\t" + distribution[i] + "\n");
        }
      }
    }
    System.out.println("totalFiles = " + totalFiles);
    System.out.println("totalDirectories = " + totalDirectories);
    System.out.println("totalBlocks = " + totalBlocks);
    System.out.println("totalSpace = " + totalSpace);
    System.out.println("maxFileSize = " + maxFileSize);
  }

  @Override
  void leaveEnclosingElement() throws IOException {
    ImageElement elem = elemS.pop();

    if(elem != ImageElement.INODE &&
       elem != ImageElement.INODE_UNDER_CONSTRUCTION)
      return;
    inInode = false;
    if(current.numBlocks < 0) {
      totalDirectories ++;
      return;
    }
    totalFiles++;
    totalBlocks += current.numBlocks;
    totalSpace += current.fileSize * current.replication;
    if(maxFileSize < current.fileSize)
      maxFileSize = current.fileSize;
    int high;
    if(current.fileSize > maxSize)
      high = distribution.length-1;
    else
      high = (int)Math.ceil((double)current.fileSize / step);

    if (high >= distribution.length) {
      high = distribution.length - 1;
    }
    distribution[high]++;
    if(totalFiles % 1000000 == 1)
      System.out.println("Files processed: " + totalFiles
          + "  Current: " + current.path);
  }

  @Override
  void visit(ImageElement element, String value) throws IOException {
    if(inInode) {
      switch(element) {
      case INODE_PATH:
        current.path = (value.equals("") ? "/" : value);
        break;
      case REPLICATION:
        current.replication = Integer.parseInt(value);
        break;
      case NUM_BYTES:
        current.fileSize += Long.parseLong(value);
        break;
      default:
        break;
      }
    }
  }

  @Override
  void visitEnclosingElement(ImageElement element) throws IOException {
    elemS.push(element);
    if(element == ImageElement.INODE ||
       element == ImageElement.INODE_UNDER_CONSTRUCTION) {
      current = new FileContext();
      inInode = true;
    }
  }

  @Override
  void visitEnclosingElement(ImageElement element,
      ImageElement key, String value) throws IOException {
    elemS.push(element);
    if(element == ImageElement.INODE ||
       element == ImageElement.INODE_UNDER_CONSTRUCTION)
      inInode = true;
    else if(element == ImageElement.BLOCKS)
      current.numBlocks = Integer.parseInt(value);
  }
}