hadoop GenSort 源码

  • 2022-10-20
  • 浏览 (155)

haddop GenSort 代码

文件路径:/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/terasort/GenSort.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.examples.terasort;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.math.BigInteger;
import java.util.zip.Checksum;

import org.apache.hadoop.util.PureJavaCrc32;

/** 
 * A single process data generator for the terasort data. Based on gensort.c 
 * version 1.1 (3 Mar 2009) from Chris Nyberg <chris.nyberg@ordinal.com>.
 */
public class GenSort {

  /**
   * Generate a "binary" record suitable for all sort benchmarks *except* 
   * PennySort.
   */
  static void generateRecord(byte[] recBuf, Unsigned16 rand, 
                                     Unsigned16 recordNumber) {
    /* generate the 10-byte key using the high 10 bytes of the 128-bit
     * random number
     */
    for(int i=0; i < 10; ++i) {
      recBuf[i] = rand.getByte(i);
    }

    /* add 2 bytes of "break" */
    recBuf[10] = 0x00;
    recBuf[11] = 0x11;

    /* convert the 128-bit record number to 32 bits of ascii hexadecimal
     * as the next 32 bytes of the record.
     */
    for (int i = 0; i < 32; i++) {
      recBuf[12 + i] = (byte) recordNumber.getHexDigit(i);
    }

    /* add 4 bytes of "break" data */
    recBuf[44] = (byte) 0x88;
    recBuf[45] = (byte) 0x99;
    recBuf[46] = (byte) 0xAA;
    recBuf[47] = (byte) 0xBB;

    /* add 48 bytes of filler based on low 48 bits of random number */
    for(int i=0; i < 12; ++i) {
      recBuf[48+i*4] = recBuf[49+i*4] = recBuf[50+i*4] = recBuf[51+i*4] =
        (byte) rand.getHexDigit(20 + i);
    }

    /* add 4 bytes of "break" data */
    recBuf[96] = (byte) 0xCC;
    recBuf[97] = (byte) 0xDD;
    recBuf[98] = (byte) 0xEE;
    recBuf[99] = (byte) 0xFF;
  }


  private static BigInteger makeBigInteger(long x) {
    byte[] data = new byte[8];
    for(int i=0; i < 8; ++i) {
      data[i] = (byte) (x >>> (56 - 8*i));
    }
    return new BigInteger(1, data);
  }

  private static final BigInteger NINETY_FIVE = new BigInteger("95");

  /**
   * Generate an ascii record suitable for all sort benchmarks including 
   * PennySort.
   */
  static void generateAsciiRecord(byte[] recBuf, Unsigned16 rand, 
                                  Unsigned16 recordNumber) {

    /* generate the 10-byte ascii key using mostly the high 64 bits.
     */
    long temp = rand.getHigh8();
    if (temp < 0) {
      // use biginteger to avoid the negative sign problem
      BigInteger bigTemp = makeBigInteger(temp);
      recBuf[0] = (byte) (' ' + (bigTemp.mod(NINETY_FIVE).longValue()));
      temp = bigTemp.divide(NINETY_FIVE).longValue();
    } else {
      recBuf[0] = (byte) (' ' + (temp % 95));
      temp /= 95;      
    }
    for(int i=1; i < 8; ++i) {
      recBuf[i] = (byte) (' ' + (temp % 95));
      temp /= 95;      
    }
    temp = rand.getLow8();
    if (temp < 0) {
      BigInteger bigTemp = makeBigInteger(temp);
      recBuf[8] = (byte) (' ' + (bigTemp.mod(NINETY_FIVE).longValue()));
      temp = bigTemp.divide(NINETY_FIVE).longValue();      
    } else {
      recBuf[8] = (byte) (' ' + (temp % 95));
      temp /= 95;
    }
    recBuf[9] = (byte)(' ' + (temp % 95));

    /* add 2 bytes of "break" */
    recBuf[10] = ' ';
    recBuf[11] = ' ';

    /* convert the 128-bit record number to 32 bits of ascii hexadecimal
     * as the next 32 bytes of the record.
     */
    for (int i = 0; i < 32; i++) {
      recBuf[12 + i] = (byte) recordNumber.getHexDigit(i);
    }

    /* add 2 bytes of "break" data */
    recBuf[44] = ' ';
    recBuf[45] = ' ';

    /* add 52 bytes of filler based on low 48 bits of random number */
    for(int i=0; i < 13; ++i) {
      recBuf[46+i*4] = recBuf[47+i*4] = recBuf[48+i*4] = recBuf[49+i*4] =
        (byte) rand.getHexDigit(19 + i);
    }

    /* add 2 bytes of "break" data */
    recBuf[98] = '\r';	/* nice for Windows */
    recBuf[99] = '\n';
}


  private static void usage() {
    PrintStream out = System.out;
    out.println("usage: gensort [-a] [-c] [-bSTARTING_REC_NUM] NUM_RECS FILE_NAME");
    out.println("-a        Generate ascii records required for PennySort or JouleSort.");
    out.println("          These records are also an alternative input for the other");
    out.println("          sort benchmarks.  Without this flag, binary records will be");
    out.println("          generated that contain the highest density of randomness in");
    out.println("          the 10-byte key.");
    out.println( "-c        Calculate the sum of the crc32 checksums of each of the");
    out.println("          generated records and send it to standard error.");
    out.println("-bN       Set the beginning record generated to N. By default the");
    out.println("          first record generated is record 0.");
    out.println("NUM_RECS  The number of sequential records to generate.");
    out.println("FILE_NAME The name of the file to write the records to.\n");
    out.println("Example 1 - to generate 1000000 ascii records starting at record 0 to");
    out.println("the file named \"pennyinput\":");
    out.println("    gensort -a 1000000 pennyinput\n");
    out.println("Example 2 - to generate 1000 binary records beginning with record 2000");
    out.println("to the file named \"partition2\":");
    out.println("    gensort -b2000 1000 partition2");
    System.exit(1);
  }


  public static void outputRecords(OutputStream out,
                                   boolean useAscii,
                                   Unsigned16 firstRecordNumber,
                                   Unsigned16 recordsToGenerate,
                                   Unsigned16 checksum
                                   ) throws IOException {
    byte[] row = new byte[100];
    Unsigned16 recordNumber = new Unsigned16(firstRecordNumber);
    Unsigned16 lastRecordNumber = new Unsigned16(firstRecordNumber);
    Checksum crc = new PureJavaCrc32();
    Unsigned16 tmp = new Unsigned16();
    lastRecordNumber.add(recordsToGenerate);
    Unsigned16 ONE = new Unsigned16(1);
    Unsigned16 rand = Random16.skipAhead(firstRecordNumber);
    while (!recordNumber.equals(lastRecordNumber)) {
      Random16.nextRand(rand);
      if (useAscii) {
        generateAsciiRecord(row, rand, recordNumber);
      } else {
        generateRecord(row, rand, recordNumber);
      }
      if (checksum != null) {
        crc.reset();
        crc.update(row, 0, row.length);
        tmp.set(crc.getValue());
        checksum.add(tmp);
      }
      recordNumber.add(ONE);
      out.write(row);
    }
  }
                                   
  public static void main(String[] args) throws Exception {
    Unsigned16 startingRecord = new Unsigned16();
    Unsigned16 numberOfRecords;
    OutputStream out;
    boolean useAscii = false;
    Unsigned16 checksum = null;

    int i;
    for(i=0; i < args.length; ++i) {
      String arg = args[i];
      int argLength = arg.length();
      if (argLength >= 1 && arg.charAt(0) == '-') {
        if (argLength < 2) {
          usage();
        }
        switch (arg.charAt(1)) {
        case 'a':
          useAscii = true;
          break;
        case 'b':
          startingRecord = Unsigned16.fromDecimal(arg.substring(2));
          break;
        case 'c':
          checksum = new Unsigned16();
          break;
        default:
          usage();
        }
      } else {
        break;
      }
    }
    if (args.length - i != 2) {
      usage();
    }
    numberOfRecords = Unsigned16.fromDecimal(args[i]);
    out = new FileOutputStream(args[i+1]);

    outputRecords(out, useAscii, startingRecord, numberOfRecords, checksum);
    out.close();
    if (checksum != null) {
      System.out.println(checksum);
    }
  }

}

相关信息

hadoop 源码目录

相关文章

hadoop Random16 源码

hadoop TeraChecksum 源码

hadoop TeraGen 源码

hadoop TeraInputFormat 源码

hadoop TeraOutputFormat 源码

hadoop TeraScheduler 源码

hadoop TeraSort 源码

hadoop TeraSortConfigKeys 源码

hadoop TeraValidate 源码

hadoop Unsigned16 源码

0  赞