Pig and Hive: A Quick Introduction
Radosław Stankiewicz, Bartłomiej Tartanus
Agenda
Introduction -> a few words about Map Reduce -> Pig -> Hive
Introduction

Volume
Variety
A|123|10$
B|555|20$
Y|333|15$

{ "typ": "A", "id": 123, "kwota": "10$" }
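Both snippets describe the same transaction in two formats. As a minimal illustration (plain Python; the helper names parse_delimited and parse_json are ours, not from the slides), either representation can be normalized into the same record:

import json

# Parse the pipe-delimited form: type|id|amount
def parse_delimited(line):
    typ, id_, kwota = line.split('|')
    return {'typ': typ, 'id': int(id_), 'kwota': kwota}

# Parse the JSON form of the same record
def parse_json(line):
    return json.loads(line)

print parse_delimited('A|123|10$')
print parse_json('{"typ": "A", "id": 123, "kwota": "10$"}')

Both calls yield a record with the keys typ, id and kwota; the point of "variety" is that a big data pipeline has to cope with both (and messier) encodings of the same fact.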
Velocity
OLAP, Real Time, Batch, Streaming, Interactive analytics
Value
Problem classification:
- A database of Warsaw streets, data in JSON format, optimizing waste collection for one of the service providers.
- Events from a transactional database and from credit cards, used for better fraud detection.
- A system that finds good car offers across many websites: web crawling, data parsing, analysis of car price trends.
- A central repository of scanned contracts, terabytes of data, several hundred new documents arriving every day.
Origins
- too much data
- server failures
- slow relational databases
Architecture
Source: Hortonworks
Hadoop Ecosystem
Source: Hortonworks
HDFS - Namenode, Datanode
Commands

User Commands: dfs, fsck
Administration Commands: datanode, dfsadmin, namenode

dfs: appendToFile cat chgrp chmod chown copyFromLocal copyToLocal count cp du dus expunge get getfacl getfattr getmerge ls lsr mkdir moveFromLocal moveToLocal mv put rm rmr setfacl setfattr setrep stat tail test text touchz

hdfs dfs -put localfile1 localfile2 /user/tmp/hadoopdir
hdfs dfs -getmerge /user/hadoop/output/ localfile
YARN Architecture
Map Reduce Framework
(diagram: map tasks M feed reduce tasks R through the shuffle)
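Before looking at the individual pieces, here is a minimal local simulation of the pipeline the diagram shows (plain Python, no Hadoop; the input lines are invented): map tasks emit key-value pairs, the framework sorts and groups them by key (the shuffle), and reduce tasks aggregate each group.

#!/usr/bin/env python
from itertools import groupby
from operator import itemgetter

def mapper(line):
    # emit (word, 1) for every word, like the streaming mapper below
    for word in line.strip().split():
        yield (word, 1)

def reducer(word, counts):
    # sum the counts for a single word, like the streaming reducer below
    return '%s,%s' % (word, sum(counts))

lines = ['Ala ma kota', 'kota ma Ala']
# map phase: run every input line through the mapper
pairs = [kv for line in lines for kv in mapper(line)]
# shuffle phase: sort by key so equal keys become adjacent, then group them
pairs.sort(key=itemgetter(0))
# reduce phase: one reducer call per distinct key
for word, group in groupby(pairs, key=itemgetter(0)):
    print reducer(word, (count for _, count in group))

In the real framework the map and reduce calls run in parallel on different nodes; the sort in the streaming test pipeline further down plays exactly the role of pairs.sort here.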
Mapper
#!/usr/bin/env python
import sys

for line in sys.stdin:
    words = line.strip().split()
    for word in words:
        print '%s\t%s' % (word, 1)
line = "Ala ma kota"

Ala 1
ma 1
kota 1
Reducer

#!/usr/bin/env python
import sys

current_word = None
current_count = 0
word = None

for line in sys.stdin:
    line = line.strip()
    word, count = line.split('\t', 1)
    count = int(count)
    if current_word == word:
        current_count += count
    else:
        if current_word:
            print '%s,%s' % (current_word, current_count)
        current_count = count
        current_word = word

if current_word == word:
    print '%s,%s' % (current_word, current_count)
ala 1
ala 1
bela 1
dela 1

ala,2
bela,1
dela,1
Running a streaming job
cat input.txt | ./mapper.py | sort | ./reducer.py
bin/yarn jar [..]/hadoop-*streaming*.jar \
    -file mapper.py -mapper ./mapper.py \
    -file reducer.py -reducer ./reducer.py \
    -input /tmp/wordcount/input -output /tmp/wordcount/output
Map Reduce in Java

(input) <k1,v1> -> map -> <k2,v2> -> combine -> <k2,v2> -> reduce -> <k3,v3> (output)

1) Mapper
2) Reducer
3) run

public class WordCount extends Configured implements Tool {
    public static class TokenizerMapper {...}
    public static class IntSumReducer {...}
    public int run(...) {...}
}
Mapper

public static class TokenizerMapper
        extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }

    public void setup(...) {...}
    public void cleanup(...) {...}
    public void run(...) {...}
}
value = "Ala ma kota"

Ala,1
ma,1
kota,1
Reducer

public static class IntSumReducer
        extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }

    public void setup(...) {...}
    public void cleanup(...) {...}
    public void run(...) {...}
}
kota,(1,1,1,1)
kota,4
Main

public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}

public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new WordCount(), args);
    System.exit(res);
}
yarn jar wc.jar WordCount /tmp/wordcount/input /tmp/wordcount/output
Introduction to data processing, using Pig as the example
Pig Architecture
Is it worth it?

Top 5 pages visited by users aged 18-25
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class MRExample {
    public static class LoadPages extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {

        public void map(LongWritable k, Text val,
                OutputCollector<Text, Text> oc,
                Reporter reporter) throws IOException {
            // Pull the key out
            String line = val.toString();
            int firstComma = line.indexOf(',');
            String key = line.substring(0, firstComma);
            String value = line.substring(firstComma + 1);
            Text outKey = new Text(key);
            // Prepend an index to the value so we know which file
            // it came from.
            Text outVal = new Text("1" + value);
            oc.collect(outKey, outVal);
        }
    }
    public static class LoadAndFilterUsers extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {

        public void map(LongWritable k, Text val,
                OutputCollector<Text, Text> oc,
                Reporter reporter) throws IOException {
            // Pull the key out
            String line = val.toString();
            int firstComma = line.indexOf(',');
            String value = line.substring(firstComma + 1);
            int age = Integer.parseInt(value);
            if (age < 18 || age > 25) return;
            String key = line.substring(0, firstComma);
            Text outKey = new Text(key);
            // Prepend an index to the value so we know which file
            // it came from.
            Text outVal = new Text("2" + value);
            oc.collect(outKey, outVal);
        }
    }

    public static class Join extends MapReduceBase
            implements Reducer<Text, Text, Text, Text> {

        public void reduce(Text key, Iterator<Text> iter,
                OutputCollector<Text, Text> oc,
                Reporter reporter) throws IOException {
            // For each value, figure out which file it's from and
            // store it accordingly.
            List<String> first = new ArrayList<String>();
            List<String> second = new ArrayList<String>();

            while (iter.hasNext()) {
                Text t = iter.next();
                String value = t.toString();
                if (value.charAt(0) == '1')
                    first.add(value.substring(1));
                else
                    second.add(value.substring(1));
                reporter.setStatus("OK");
            }

            // Do the cross product and collect the values
            for (String s1 : first) {
                for (String s2 : second) {
                    String outval = key + "," + s1 + "," + s2;
                    oc.collect(null, new Text(outval));
                    reporter.setStatus("OK");
                }
            }
        }
    }
    public static class LoadJoined extends MapReduceBase
            implements Mapper<Text, Text, Text, LongWritable> {

        public void map(Text k, Text val,
                OutputCollector<Text, LongWritable> oc,
                Reporter reporter) throws IOException {
            // Find the url
            String line = val.toString();
            int firstComma = line.indexOf(',');
            int secondComma = line.indexOf(',', firstComma);
            String key = line.substring(firstComma, secondComma);
            // drop the rest of the record, I don't need it anymore,
            // just pass a 1 for the combiner/reducer to sum instead.
            Text outKey = new Text(key);
            oc.collect(outKey, new LongWritable(1L));
        }
    }

    public static class ReduceUrls extends MapReduceBase
            implements Reducer<Text, LongWritable, WritableComparable, Writable> {

        public void reduce(Text key, Iterator<LongWritable> iter,
                OutputCollector<WritableComparable, Writable> oc,
                Reporter reporter) throws IOException {
            // Add up all the values we see
            long sum = 0;
            while (iter.hasNext()) {
                sum += iter.next().get();
                reporter.setStatus("OK");
            }
            oc.collect(key, new LongWritable(sum));
        }
    }
    public static class LoadClicks extends MapReduceBase
            implements Mapper<WritableComparable, Writable, LongWritable, Text> {

        public void map(WritableComparable key, Writable val,
                OutputCollector<LongWritable, Text> oc,
                Reporter reporter) throws IOException {
            oc.collect((LongWritable)val, (Text)key);
        }
    }

    public static class LimitClicks extends MapReduceBase
            implements Reducer<LongWritable, Text, LongWritable, Text> {

        int count = 0;
        p