Upload
pavan2711
View
20
Download
0
Embed Size (px)
DESCRIPTION
Joins
Citation preview
Hpot-Tech
1 Joins
Create a Java project JoinMap and create the following classes:
Hpot-Tech
2 Joins
package com.hp.join; // == JobBuilder import java.io.IOException;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.Tool;
public class JobBuilder {
private final Class driverClass; private final Job job; private final int extraArgCount; private final String extrArgsUsage;
private String[] extraArgs;
public JobBuilder(Class driverClass) throws IOException { this(driverClass, 0, ""); }
public JobBuilder(Class driverClass, int extraArgCount, String extrArgsUsage) throws IOException { this.driverClass = driverClass; this.extraArgCount = extraArgCount; this.job = new Job(); this.job.setJarByClass(driverClass); this.extrArgsUsage = extrArgsUsage; }
// vv JobBuilder public static Job parseInputAndOutput(Tool tool, Configuration conf, String[] args) throws IOException {
if (args.length != 2) { printUsage(tool, " "); return null;
Hpot-Tech
3 Joins
} Job job = new Job(conf); job.setJarByClass(tool.getClass()); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); return job; }
public static void printUsage(Tool tool, String extraArgsUsage) { System.err.printf("Usage: %s [genericOptions] %s\n\n", tool.getClass().getSimpleName(), extraArgsUsage); GenericOptionsParser.printGenericCommandUsage(System.err); } // ^^ JobBuilder
public JobBuilder withCommandLineArgs(String... args) throws IOException { Configuration conf = job.getConfiguration(); GenericOptionsParser parser = new GenericOptionsParser(conf, args); String[] otherArgs = parser.getRemainingArgs(); if (otherArgs.length < 2 && otherArgs.length > 3 + extraArgCount) { System.err.printf("Usage: %s [genericOptions] [-overwrite] %s\n\n", driverClass.getSimpleName(), extrArgsUsage); GenericOptionsParser.printGenericCommandUsage(System.err); System.exit(-1); } int index = 0; boolean overwrite = false; if (otherArgs[index].equals("-overwrite")) { overwrite = true; index++; } Path input = new Path(otherArgs[index++]); Path output = new Path(otherArgs[index++]);
if (index < otherArgs.length) { extraArgs = new String[otherArgs.length - index]; System.arraycopy(otherArgs, index, extraArgs, 0, otherArgs.length - index); }
if (overwrite) {
Hpot-Tech
4 Joins
output.getFileSystem(conf).delete(output, true); }
FileInputFormat.addInputPath(job, input); FileOutputFormat.setOutputPath(job, output); return this; }
public Job build() { return job; }
public String[] getExtraArgs() { return extraArgs; } }
Hpot-Tech
5 Joins
package com.hp.join;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter;
public class JoinRecordMapper extends MapReduceBase implements Mapper { private NcdcRecordParser parser = new NcdcRecordParser();
public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
parser.parse(value); output.collect(new TextPair(parser.getStationId(), "1"), value); } }
Hpot-Tech
6 Joins
package com.hp.join;
import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.*; import org.apache.hadoop.mapred.lib.MultipleInputs; import org.apache.hadoop.util.*; @SuppressWarnings("deprecation") public class JoinRecordWithStationName extends Configured implements Tool {
public static class KeyPartitioner implements Partitioner { @Override public void configure(JobConf job) {}
@Override public int getPartition(TextPair key, Text value, int numPartitions) { return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions; } }
@Override public int run(String[] args) throws Exception { if (args.length != 3) { JobBuilder.printUsage(this, " "); return -1; }
JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Join record with station name");
Path ncdcInputPath = new Path(args[0]); Path stationInputPath = new Path(args[1]); Path outputPath = new Path(args[2]);
MultipleInputs.addInputPath(conf, ncdcInputPath, TextInputFormat.class, JoinRecordMapper.class); MultipleInputs.addInputPath(conf, stationInputPath, TextInputFormat.class, JoinStationMapper.class);
Hpot-Tech
7 Joins
FileOutputFormat.setOutputPath(conf, outputPath);
conf.setPartitionerClass(KeyPartitioner.class); conf.setOutputValueGroupingComparator(TextPair.FirstComparator.class);
conf.setMapOutputKeyClass(TextPair.class);
conf.setReducerClass(JoinReducer.class);
conf.setOutputKeyClass(Text.class);
JobClient.runJob(conf); return 0; }
public static void main(String[] args) throws Exception { args = new String[3]; args[0] = "inputncdc"; args[1] = "inputstation"; args[2] = "output"+System.currentTimeMillis();
int exitCode = ToolRunner.run(new JoinRecordWithStationName(), args); System.exit(exitCode); } }
Hpot-Tech
8 Joins
package com.hp.join;
import java.io.IOException; import java.util.Iterator;
import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.*;
public class JoinReducer extends MapReduceBase implements Reducer {
public void reduce(TextPair key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
Text stationName = new Text(values.next()); while (values.hasNext()) { Text record = values.next(); Text outValue = new Text(stationName.toString() + "\t" + record.toString()); output.collect(key.getFirst(), outValue); } } }
Hpot-Tech
9 Joins
package com.hp.join;
import java.io.IOException;
import org.apache.hadoop.io.*; import org.apache.hadoop.mapred.*;
public class JoinStationMapper extends MapReduceBase implements Mapper { private NcdcStationMetadataParser parser = new NcdcStationMetadataParser();
public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
if (parser.parse(value)) { output.collect(new TextPair(parser.getStationId(), "0"), new Text(parser.getStationName())); } } }
Hpot-Tech
10 Joins
package com.hp.join; import java.math.*; import org.apache.hadoop.io.Text;
public class MetOfficeRecordParser {
private String year; private String airTemperatureString; private int airTemperature; private boolean airTemperatureValid;
public void parse(String record) { if (record.length() < 18) { return; } year = record.substring(3, 7); if (isValidRecord(year)) { airTemperatureString = record.substring(13, 18); if (!airTemperatureString.trim().equals("---")) { BigDecimal temp = new BigDecimal(airTemperatureString.trim()); temp = temp.multiply(new BigDecimal(BigInteger.TEN)); airTemperature = temp.intValueExact(); airTemperatureValid = true; } } }
private boolean isValidRecord(String year) { try { Integer.parseInt(year); return true; } catch (NumberFormatException e) { return false; } }
public void parse(Text record) { parse(record.toString()); }
Hpot-Tech
11 Joins
public String getYear() { return year; }
public int getAirTemperature() { return airTemperature; }
public String getAirTemperatureString() { return airTemperatureString; }
public boolean isValidTemperature() { return airTemperatureValid; }
}
Hpot-Tech
12 Joins
package com.hp.join; import java.text.*; import java.util.Date;
import org.apache.hadoop.io.Text;
public class NcdcRecordParser {
private static final int MISSING_TEMPERATURE = 9999;
private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmm");
private String stationId; private String observationDateString; private String year; private String airTemperatureString; private int airTemperature; private boolean airTemperatureMalformed; private String quality;
public void parse(String record) { stationId = record.substring(4, 10) + "-" + record.substring(10, 15); observationDateString = record.substring(15, 27); year = record.substring(15, 19); airTemperatureMalformed = false; // Remove leading plus sign as parseInt doesn't like them if (record.charAt(87) == '+') { airTemperatureString = record.substring(88, 92); airTemperature = Integer.parseInt(airTemperatureString); } else if (record.charAt(87) == '-') { airTemperatureString = record.substring(87, 92); airTemperature = Integer.parseInt(airTemperatureString); } else { airTemperatureMalformed = true; } airTemperature = Integer.parseInt(airTemperatureString); quality = record.substring(92, 93); }
Hpot-Tech
13 Joins
public void parse(Text record) { parse(record.toString()); }
public boolean isValidTemperature() { return !airTemperatureMalformed && airTemperature != MISSING_TEMPERATURE && quality.matches("[01459]"); }
public boolean isMalformedTemperature() { return airTemperatureMalformed; }
public boolean isMissingTemperature() { return airTemperature == MISSING_TEMPERATURE; }
public String getStationId() { return stationId; }
public Date getObservationDate() { try { System.out.println(observationDateString); return DATE_FORMAT.parse(observationDateString); } catch (ParseException e) { throw new IllegalArgumentException(e); } }
public String getYear() { return year; }
public int getYearInt() { return Integer.parseInt(year); }
public int getAirTemperature() { return airTemperature;
Hpot-Tech
14 Joins
}
public String getAirTemperatureString() { return airTemperatureString; }
public String getQuality() { return quality; }
}
Hpot-Tech
15 Joins
package com.hp.join; import java.io.*; import java.util.*; import org.apache.hadoop.io.IOUtils;
public class NcdcStationMetadata {
private Map stationIdToName = new HashMap();
public void initialize(File file) throws IOException { BufferedReader in = null; try { in = new BufferedReader(new InputStreamReader(new FileInputStream(file))); NcdcStationMetadataParser parser = new NcdcStationMetadataParser(); String line; while ((line = in.readLine()) != null) { if (parser.parse(line)) { stationIdToName.put(parser.getStationId(), parser.getStationName()); } } } finally { IOUtils.closeStream(in); } }
public String getStationName(String stationId) { String stationName = stationIdToName.get(stationId); if (stationName == null || stationName.trim().length() == 0) { return stationId; // no match: fall back to ID } return stationName; }
public Map getStationIdToNameMap() { return Collections.unmodifiableMap(stationIdToName); }
}
Hpot-Tech
16 Joins
package com.hp.join; import org.apache.hadoop.io.Text;
public class NcdcStationMetadataParser {
private String stationId; private String stationName;
public boolean parse(String record) { if (record.length() < 42) { // header return false; } String usaf = record.substring(0, 6); String wban = record.substring(7, 12); stationId = usaf + "-" + wban; stationName = record.substring(13, 42); try { Integer.parseInt(usaf); // USAF identifiers are numeric return true; } catch (NumberFormatException e) { return false; } }
public boolean parse(Text record) { return parse(record.toString()); }
public String getStationId() { return stationId; }
public String getStationName() { return stationName; }
}
Hpot-Tech
17 Joins
package com.hp.join; // cc TextPair A Writable implementation that stores a pair of Text objects // cc TextPairComparator A RawComparator for comparing TextPair byte representations // cc TextPairFirstComparator A custom RawComparator for comparing the first field of TextPair byte representations // vv TextPair import java.io.*;
import org.apache.hadoop.io.*;
public class TextPair implements WritableComparable {
private Text first; private Text second;
public TextPair() { set(new Text(), new Text()); }
public TextPair(String first, String second) { set(new Text(first), new Text(second)); }
public TextPair(Text first, Text second) { set(first, second); }
public void set(Text first, Text second) { this.first = first; this.second = second; }
public Text getFirst() { return first; }
public Text getSecond() { return second; }
@Override
Hpot-Tech
18 Joins
public void write(DataOutput out) throws IOException { first.write(out); second.write(out); }
@Override public void readFields(DataInput in) throws IOException { first.readFields(in); second.readFields(in); }
@Override public int hashCode() { return first.hashCode() * 163 + second.hashCode(); }
@Override public boolean equals(Object o) { if (o instanceof TextPair) { TextPair tp = (TextPair) o; return first.equals(tp.first) && second.equals(tp.second); } return false; }
@Override public String toString() { return first + "\t" + second; }
@Override public int compareTo(TextPair tp) { int cmp = first.compareTo(tp.first); if (cmp != 0) { return cmp; } return second.compareTo(tp.second); } // ^^ TextPair
Hpot-Tech
19 Joins
// vv TextPairComparator public static class Comparator extends WritableComparator {
private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
public Comparator() { super(TextPair.class); }
@Override public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
try { int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1); int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2); int cmp = TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2); if (cmp != 0) { return cmp; } return TEXT_COMPARATOR.compare(b1, s1 + firstL1, l1 - firstL1, b2, s2 + firstL2, l2 - firstL2); } catch (IOException e) { throw new IllegalArgumentException(e); } } }
static { WritableComparator.define(TextPair.class, new Comparator()); } // ^^ TextPairComparator
// vv TextPairFirstComparator public static class FirstComparator extends WritableComparator {
private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
public FirstComparator() { super(TextPair.class);
Hpot-Tech
20 Joins
}
@Override public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
try { int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1); int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2); return TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2); } catch (IOException e) { throw new IllegalArgumentException(e); } }
@Override public int compare(WritableComparable a, WritableComparable b) { if (a instanceof TextPair && b instanceof TextPair) { return ((TextPair) a).first.compareTo(((TextPair) b).first); } return super.compare(a, b); } } // ^^ TextPairFirstComparator
// vv TextPair } // ^^ TextPair
Hpot-Tech
21 Joins
Create the following folder and copy the file:
Hpot-Tech
22 Joins
Hpot-Tech
23 Joins
Run the application:
Hpot-Tech
24 Joins
Submit the jar in cluster:
Export the jar and submit as follows:
Create the necessary input folders:
Comment out the hardcoded path initialization in main() as follows, so the paths are taken from the command line:
/*args = new String[3]; args[0] = "inputncdc"; args[1] = "inputstation"; args[2] = "output"+System.currentTimeMillis();*/
#hadoop fs -mkdir incdc/
#hadoop fs -mkdir instation/
#hadoop fs -copyFromLocal /hadoop/data/sample.txt incdc/
#hadoop fs -copyFromLocal /hadoop/data/stations*.txt instation/
#hadoop jar /hadoop/hadoop/myhadoopjoin.jar com.hp.join.JoinRecordWithStationName incdc instation outputs
Hpot-Tech
25 Joins
You can view the data as follows: