Upload
pavan2711
View
20
Download
0
Embed Size (px)
DESCRIPTION
Joins
Citation preview
Hpot-Tech
1 Joins
Create a Java project JoinMap and create the following classes:
Hpot-Tech
2 Joins
package com.hp.join; // == JobBuilder import java.io.IOException;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.Tool;
public class JobBuilder {
private final Class driverClass; private final Job job; private final int extraArgCount; private final String extrArgsUsage;
private String[] extraArgs;
public JobBuilder(Class driverClass) throws IOException { this(driverClass, 0, ""); }
public JobBuilder(Class driverClass, int extraArgCount, String extrArgsUsage) throws IOException { this.driverClass = driverClass; this.extraArgCount = extraArgCount; this.job = new Job(); this.job.setJarByClass(driverClass); this.extrArgsUsage = extrArgsUsage; }
// vv JobBuilder public static Job parseInputAndOutput(Tool tool, Configuration conf, String[] args) throws IOException {
if (args.length != 2) { printUsage(tool, " "); return null;
Hpot-Tech
3 Joins
} Job job = new Job(conf); job.setJarByClass(tool.getClass()); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); return job; }
public static void printUsage(Tool tool, String extraArgsUsage) { System.err.printf("Usage: %s [genericOptions] %s\n\n", tool.getClass().getSimpleName(), extraArgsUsage); GenericOptionsParser.printGenericCommandUsage(System.err); } // ^^ JobBuilder
public JobBuilder withCommandLineArgs(String... args) throws IOException { Configuration conf = job.getConfiguration(); GenericOptionsParser parser = new GenericOptionsParser(conf, args); String[] otherArgs = parser.getRemainingArgs(); if (otherArgs.length < 2 && otherArgs.length > 3 + extraArgCount) { System.err.printf("Usage: %s [genericOptions] [-overwrite] %s\n\n", driverClass.getSimpleName(), extrArgsUsage); GenericOptionsParser.printGenericCommandUsage(System.err); System.exit(-1); } int index = 0; boolean overwrite = false; if (otherArgs[index].equals("-overwrite")) { overwrite = true; index++; } Path input = new Path(otherArgs[index++]); Path output = new Path(otherArgs[index++]);
if (index < otherArgs.length) { extraArgs = new String[otherArgs.length - index]; System.arraycopy(otherArgs, index, extraArgs, 0, otherArgs.length - index); }
if (overwrite) {
Hpot-Tech
4 Joins
output.getFileSystem(conf).delete(output, true); }
FileInputFormat.addInputPath(job, input); FileOutputFormat.setOutputPath(job, output); return this; }
public Job build() { return job; }
public String[] getExtraArgs() { return extraArgs; } }
Hpot-Tech
5 Joins
package com.hp.join;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter;
public class JoinRecordMapper extends MapReduceBase implements Mapper { private NcdcRecordParser parser = new NcdcRecordParser();
public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
parser.parse(value); output.collect(new TextPair(parser.getStationId(), "1"), value); } }
Hpot-Tech
6 Joins
package com.hp.join;
import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.*; import org.apache.hadoop.mapred.lib.MultipleInputs; import org.apache.hadoop.util.*; @SuppressWarnings("deprecation") public class JoinRecordWithStationName extends Configured implements Tool {
public static class KeyPartitioner implements Partitioner { @Override public void configure(JobConf job) {}
@Override public int getPartition(TextPair key, Text value, int numPartitions) { return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions; } }
@Override public int run(String[] args) throws Exception { if (args.length != 3) { JobBuilder.printUsage(this, " "); return -1; }
JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Join record with station name");
Path ncdcInputPath = new Path(args[0]); Path stationInputPath = new Path(args[1]); Path outputPath = new Path(args[2]);
MultipleInputs.addInputPath(conf, ncdcInputPath, TextInputFormat.class, JoinRecordMapper.class); MultipleInputs.addInputPath(conf, stationInputPath, TextInputFormat.class, JoinStationMapper.class);
Hpot-Tech
7 Joins
FileOutputFormat.setOutputPath(conf, outputPath);
conf.setPartitionerClass(KeyPartitioner.class); conf.setOutputValueGroupingComparator(TextPair.FirstComparator.class);
conf.setMapOutputKeyClass(TextPair.class);
conf.setReducerClass(JoinReducer.class);
conf.setOutputKeyClass(Text.class);
JobClient.runJob(conf); return 0; }
public static void main(String[] args) throws Exception { args = new String[3]; args[0] = "inputncdc"; args[1] = "inputstation"; args[2] = "output"+System.currentTimeMillis();
int exitCode = ToolRunner.run(new JoinRecordWithStationName(), args); System.exit(exitCode); } }
Hpot-Tech
8 Joins
package com.hp.join;
import java.io.IOException; import java.util.Iterator;
import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.*;
public class JoinReducer extends MapReduceBase implements Reducer {
public void reduce(TextPair key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
Text stationName = new Text(values.next()); while (values.hasNext()) { Text record = values.next(); Text outValue = new Text(stationName.toString() + "\t" + record.toString()); output.collect(key.getFirst(), outValue); } } }
Hpot-Tech
9 Joins
package com.hp.join;
import java.io.IOException;
import org.apache.hadoop.io.*; import org.apache.hadoop.mapred.*;
public class JoinStationMapper extends MapReduceBase implements Mapper { private NcdcStationMetadataParser parser = new NcdcStationMetadataParser();
public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
if (parser.parse(value)) { output.collect(new TextPair(parser.getStationId(), "0"), new Text(parser.getStationName())); } } }
Hpot-Tech
10 Joins
package com.hp.join; import java.math.*; import org.apache.hadoop.io.Text;
public class MetOfficeRecordParser {
private String year; private String airTemperatureString; private int airTemperature; private boolean airTemperatureValid;
public void parse(String record) { if (record.length() < 18) { return; } year = record.substring(3, 7); if (isValidRecord(year)) { airTemperatureString = record.substring(13, 18); if (!airTemperatureString.trim().equals("---")) { BigDecimal temp = new BigDecimal(airTemperatureString.trim()); temp = temp.multiply(new BigDecimal(BigInteger.TEN)); airTemperature = temp.intValueExact(); airTemperatureValid = true; } } }
private boolean isValidRecord(String year) { try { Integer.parseInt(year); return true; } catch (NumberFormatException e) { return false; } }
public void parse(Text record) { parse(record.toString()); }
Hpot-Tech
11 Joins
public String getYear() { return year; }
public int getAirTemperature() { return airTemperature; }
public String getAirTemperatureString() { return airTemperatureString; }
public boolean isValidTemperature() { return airTemperatureValid; }
}
Hpot-Tech
12 Joins
package com.hp.join; import java.text.*; import java.util.Date;
import org.apache.hadoop.io.Text;
public class NcdcRecordParser {
private static final int MISSING_TEMPERATURE = 9999;
private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmm");
private String stationId; private String observationDateString; private String year; private String airTemperatureString; private int airTemperature; private boolean airTemperatureMalformed; private String quality;
public void parse(String record) { stationId = record.substring(4, 10) + "-" + record.substring(10, 15); observationDateString = record.substring(15, 27); year = record.substring(15, 19); airTemperatureMalformed = false; // Remove leading plus sign as parseInt doesn't like them if (record.charAt(87) == '+') { airTemperatureString = record.substring(88, 92); airTemperature = Integer.parseInt(airTemperatureString); } else if (record.charAt(87) == '-') { airTemperatureString = record.substring(87, 92); airTemperature = Integer.parseInt(airTemperatureString); } else { airTemperatureMalformed = true; } airTemperature = Integer.parseInt(airTemperatureString); quality = record.substring(92, 93); }
Hpot-Tech
13 Joins
public void parse(Text record) { parse(record.toString()); }
public boolean isValidTemperature() { return !airTemperatureMalformed && airTemperature != MISSING_TEMPERATURE && quality.matches("[01459]"); }
public boolean isMalformedTemperature() { return airTemperatureMalformed; }
public boolean isMissingTemperature() { return airTemperature == MISSING_TEMPERATURE; }
public String getStationId() { return stationId; }
public Date getObservationDate() { try { System.out.println(observationDateString); return DATE_FORMAT.parse(observationDateString); } catch (ParseException e) { throw new IllegalArgumentException(e); } }
public String getYear() { return year; }
public int getYearInt() { return Integer.parseInt(year); }
public int getAirTemperature() { return airTemperature;
Hpot-Tech
14 Joins
}
public String getAirTemperatureString() { return airTemperatureString; }
public String getQuality() { return quality; }
}
Hpot-Tech
15 Joins
package com.hp.join; import java.io.*; import java.util.*; import org.apache.hadoop.io.IOUtils;
public class NcdcStationMetadata {
private Map stationIdToName = new HashMap();
public void initialize(File file) throws IOException { BufferedReader in = null; try { in = new BufferedReader(new InputStreamReader(new FileInputStream(file))); NcdcStationMetadataParser parser = new NcdcStationMetadataParser(); String line; while ((line = in.readLine()) != null) { if (parser.parse(line)) { stationIdToName.put(parser.getStationId(), parser.getStationName()); } } } finally { IOUtils.closeStream(in); } }
public String getStationName(String stationId) { String stationName = stationIdToName.get(stationId); if (stationName == null || stationName.trim().length() == 0) { return stationId; // no match: fall back to ID } return stationName; }
public Map getStationIdToNameMap() { return Collections.unmodifiableMap(stationIdToName); }
}
Hpot-Tech
16 Joins
package com.hp.join; import org.apache.hadoop.io.Text;
public class NcdcStationMetadataParser {
private String stationId; private String stationName;
public boolean parse(String record) { if (record.length() < 42) { // header return false; } String usaf = record.substring(0, 6); String wban = record.substring(7, 12); stationId = usaf + "-" + wban; stationName = record.substring(13, 42); try { Integer.parseInt(usaf); // USAF identifiers are numeric return true; } catch (NumberFormatException e) { return false; } }
public boolean parse(Text record) { return parse(record.toString()); }
public String getStationId() { return stationId; }
public String getStationName() { return stationName; }
}
Hpot-Tech
17 Joins
package com.hp.join; // cc TextPair A Writable implementation that stores a pair of Text objects // cc TextPairComparator A RawComparator for comparing TextPair byte representations // cc TextPairFirstComparator A custom RawComparator for comparing the first field of TextPair byte representations // vv TextPair import java.io.*;
import org.apache.hadoop.io.*;
public class TextPair implements WritableComparable {
private Text first; private Text second;
public TextPair() { set(new Text(), new Text()); }
public TextPair(String first, String second) { set(new Text(first), new Text(second)); }
public TextPair(Text first, Text second) { set(first, second); }
public void set(Text first, Text second) { this.first = first; this.second = second; }
public Text getFirst() { return first; }
public Text getSecond() { return second; }
@Override
Hpot-Tech
18 Joins
public void write(DataOutput out) throws IOException { first.write(out); second.write(out); }
@Override public void readFields(DataInput in) throws IOException { first.readFields(in); second.readFields(in); }
@Override public int hashCode() { return first.hashCode() * 163 + second.hashCode(); }
@Override public boolean equals(Object o) { if (o instanceof TextPair) { TextPair tp = (TextPair) o; return first.equals(tp.first) && second.equals(tp.second); } return false; }
@Override public String toString() { return first + "\t" + second; }
@Override public int compareTo(TextPair tp) { int cmp = first.compareTo(tp.first); if (cmp != 0) { return cmp; } return second.compareTo(tp.second); } // ^^ TextPair
Hpot-Tech
19 Joins
// vv TextPairComparator public static class Comparator extends WritableComparator {
private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
public Comparator() { super(TextPair.class); }
@Override public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
try { int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1); int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2); int cmp = TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2); if (cmp != 0) { return cmp; } return TEXT_COMPARATOR.compare(b1, s1 + firstL1, l1 - firstL1, b2, s2 + firstL2, l2 - firstL2); } catch (IOException e) { throw new IllegalArgumentException(e); } } }
static { WritableComparator.define(TextPair.class, new Comparator()); } // ^^ TextPairComparator
// vv TextPairFirstComparator public static class FirstComparator extends WritableComparator {
private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
public FirstComparator() { super(TextPair.class);
Hpot-Tech
20 Joins
}
@Override public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
try { int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1); int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2); return TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2); } catch (IOException e) { throw new IllegalArgumentException(e); } }
@Override public int compare(WritableComparable a, WritableComparable b) { if (a instanceof TextPair && b instanceof TextPair) { return ((TextPair) a).first.compareTo(((TextPair) b).first); } return super.compare(a, b); } } // ^^ TextPairFirstComparator
// vv TextPair } // ^^ TextPair
Hpot-Tech
21 Joins
Create the following folder and copy the file:
Hpot-Tech
22 Joins
Hpot-Tech
23 Joins
Run the application:
Hpot-Tech
24 Joins
Submit the jar in cluster:
Export the jar and submit as follows:
Create the necessary input folders:
Comment out the hardcoded path initialization in main() as follows, so the paths are taken from the command line:
/*args = new String[3]; args[0] = "inputncdc"; args[1] = "inputstation"; args[2] = "output"+System.currentTimeMillis();*/
#hadoop fs -mkdir incdc/
#hadoop fs -mkdir instation/
#hadoop fs -copyFromLocal /hadoop/data/sample.txt incdc/
#hadoop fs -copyFromLocal /hadoop/data/stations*.txt instation/
#hadoop jar /hadoop/hadoop/myhadoopjoin.jar com.hp.join.JoinRecordWithStationName incdc instation outputs
Hpot-Tech
25 Joins
You can view the data as follows: