Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course


  • Pig and Hive: a crash course

    Radosław Stankiewicz, Bartłomiej Tartanus

  • Agenda

    Introduction -> a few words about MapReduce -> Pig -> Hive

  • Introduction

  • Volume

  • Variety

    A|123|10$
    B|555|20$
    Y|333|15$

    { 'typ'='A', 'id'=123, 'kwota'='10$' }
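    (A minimal sketch, not from the slides: the same record can arrive pipe-delimited or as JSON, and both can be normalised into one structure. The field names typ/id/kwota come from the example above.)

    import json

    delimited = "A|123|10$"
    as_json = '{"typ": "A", "id": 123, "kwota": "10$"}'

    def parse_delimited(line):
        # split the pipe-delimited record into the same three fields
        typ, rec_id, kwota = line.split("|")
        return {"typ": typ, "id": int(rec_id), "kwota": kwota}

    print(parse_delimited(delimited))  # {'typ': 'A', 'id': 123, 'kwota': '10$'}
    print(json.loads(as_json))         # the same record, parsed from JSON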

  • Velocity

    (diagram: Batch, OLAP, Interactive analytics, Real Time, Streaming)

  • Value


  • Problem classification

    A database of Warsaw streets, data in JSON format, optimising waste collection for one of the service providers.

    Events from a transactional database and credit card records, for better fraud detection.

    A system that finds good car offers across many sites: web crawling, data parsing, analysis of car price trends.

    A central repository of contract scans, terabytes of data, several hundred new documents arriving every day.

  • Origins

    too much data

    server failures

    slow relational databases


  • Architecture

    Source: Hortonworks

  • The Hadoop ecosystem

    Source: Hortonworks


  • HDFS: NameNode, DataNode

  • Commands

    User commands: dfs, fsck
    Administration commands: datanode, dfsadmin, namenode

    dfs subcommands: appendToFile, cat, chgrp, chmod, chown, copyFromLocal, copyToLocal, count, cp, du, dus, expunge, get, getfacl, getfattr, getmerge, ls, lsr, mkdir, moveFromLocal, moveToLocal, mv, put, rm, rmr, setfacl, setfattr, setrep, stat, tail, test, text, touchz

    hdfs dfs -put localfile1 localfile2 /user/tmp/hadoopdir
    hdfs dfs -getmerge /user/hadoop/output/ localfile
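    (A minimal sketch of scripting the same two dfs commands from Python; the paths are the illustrative ones used above, and it assumes a configured hdfs client on the PATH.)

    import subprocess

    # upload two local files into an HDFS directory
    subprocess.run(["hdfs", "dfs", "-put", "localfile1", "localfile2", "/user/tmp/hadoopdir"], check=True)

    # merge all files from an HDFS directory into a single local file
    subprocess.run(["hdfs", "dfs", "-getmerge", "/user/hadoop/output/", "localfile"], check=True)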

  • YARN architecture

  • Map Reduce Framework


  • Map Reduce Framework

    (diagram: map tasks (M) feeding reduce tasks (R))
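    (A framework-free sketch of the dataflow above, for intuition only: map emits key/value pairs, the shuffle groups the pairs by key, and reduce aggregates each group. The word-count logic mirrors the streaming example on the next slides.)

    from itertools import groupby

    def map_phase(line):
        # emit (word, 1) for every word in the line
        for word in line.strip().split():
            yield (word, 1)

    def reduce_phase(word, counts):
        # sum all counts emitted for one word
        return (word, sum(counts))

    lines = ["Ala ma kota", "kota ma Ala"]
    pairs = [kv for line in lines for kv in map_phase(line)]   # map
    pairs.sort(key=lambda kv: kv[0])                           # shuffle & sort
    result = [reduce_phase(word, (c for _, c in group))
              for word, group in groupby(pairs, key=lambda kv: kv[0])]
    print(result)   # [('Ala', 2), ('kota', 2), ('ma', 2)]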

  • Mapper

    #!/usr/bin/env python
    import sys

    # read lines from standard input and emit "word<TAB>1" for every word
    for line in sys.stdin:
        words = line.strip().split()
        for word in words:
            print '%s\t%s' % (word, 1)

    line = "Ala ma kota"

    Ala 1
    ma 1
    kota 1

  • Reducer

    #!/usr/bin/env python
    import sys

    current_word = None
    current_count = 0
    word = None

    # input arrives sorted by key, so equal words are adjacent
    for line in sys.stdin:
        line = line.strip()
        word, count = line.split('\t', 1)
        count = int(count)
        if current_word == word:
            current_count += count
        else:
            if current_word:
                print '%s,%s' % (current_word, current_count)
            current_count = count
            current_word = word

    # flush the last word
    if current_word == word:
        print '%s,%s' % (current_word, current_count)

    ala 1
    ala 1
    bela 1
    dela 1

    ala,2
    bela,1
    dela,1
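    (Illustrative re-implementation of the reducer's logic as a plain function, to show why its input must be sorted: it only merges adjacent equal keys, which is exactly what the shuffle phase, or sort in a local test, guarantees.)

    def streaming_reduce(lines):
        result, current_word, current_count = [], None, 0
        for line in lines:
            word, count = line.split('\t', 1)
            if word == current_word:
                current_count += int(count)
            else:
                if current_word is not None:
                    result.append((current_word, current_count))
                current_word, current_count = word, int(count)
        if current_word is not None:
            result.append((current_word, current_count))
        return result

    pairs = ["ala\t1", "kota\t1", "ala\t1"]
    print(streaming_reduce(pairs))          # unsorted input splits the count: [('ala', 1), ('kota', 1), ('ala', 1)]
    print(streaming_reduce(sorted(pairs)))  # sorted input: [('ala', 2), ('kota', 1)]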

  • Running the streaming job

    cat input.txt | ./mapper.py | sort | ./reducer.py

    bin/yarn jar [..]/hadoop-*streaming*.jar \
        -file mapper.py  -mapper ./mapper.py \
        -file reducer.py -reducer ./reducer.py \
        -input /tmp/wordcount/input -output /tmp/wordcount/output


  • MapReduce in Java

    (input) <k1,v1> -> map -> <k2,v2> -> combine -> <k2,v2> -> reduce -> <k3,v3> (output)

    1) Mapper  2) Reducer  3) run

    public class WordCount extends Configured implements Tool {
        public static class TokenizerMapper {...}
        public static class IntSumReducer {...}
        public int run(...) {...}
    }

  • Mapper

    public static class TokenizerMapper
            extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }

        public void setup(...) {...}
        public void cleanup(...) {...}
        public void run(...) {...}
    }

    value = "Ala ma kota"

    Ala,1  ma,1  kota,1

  • Reducer

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }

        public void setup(...) {...}
        public void cleanup(...) {...}
        public void run(...) {...}
    }

    kota,(1,1,1,1)

    kota,4

  • Main

    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // run() must return an exit code rather than calling System.exit directly
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new WordCount(), args);
        System.exit(res);
    }

    yarn jar wc.jar WordCount /tmp/wordcount/input /tmp/wordcount/output

  • An introduction to data processing, using Pig as the example

  • Pig architecture

  • Is it worth it?

    The top 5 pages visited by users aged 18-25

  • import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.Writable;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.KeyValueTextInputFormat;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.RecordReader;
    import org.apache.hadoop.mapred.Reducer;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapred.SequenceFileInputFormat;
    import org.apache.hadoop.mapred.SequenceFileOutputFormat;
    import org.apache.hadoop.mapred.TextInputFormat;
    import org.apache.hadoop.mapred.jobcontrol.Job;
    import org.apache.hadoop.mapred.jobcontrol.JobControl;
    import org.apache.hadoop.mapred.lib.IdentityMapper;

    public class MRExample {

        public static class LoadPages extends MapReduceBase
                implements Mapper<LongWritable, Text, Text, Text> {

            public void map(LongWritable k, Text val,
                            OutputCollector<Text, Text> oc,
                            Reporter reporter) throws IOException {
                // Pull the key out
                String line = val.toString();
                int firstComma = line.indexOf(',');
                String key = line.substring(0, firstComma);
                String value = line.substring(firstComma + 1);
                Text outKey = new Text(key);
                // Prepend an index to the value so we know which file
                // it came from.
                Text outVal = new Text("1" + value);
                oc.collect(outKey, outVal);
            }
        }

        public static class LoadAndFilterUsers extends MapReduceBase
                implements Mapper<LongWritable, Text, Text, Text> {

            public void map(LongWritable k, Text val,
                            OutputCollector<Text, Text> oc,
                            Reporter reporter) throws IOException {
                // Pull the key out
                String line = val.toString();
                int firstComma = line.indexOf(',');
                String value = line.substring(firstComma + 1);
                int age = Integer.parseInt(value);
                if (age < 18 || age > 25) return;
                String key = line.substring(0, firstComma);
                Text outKey = new Text(key);
                // Prepend an index to the value so we know which file
                // it came from.
                Text outVal = new Text("2" + value);
                oc.collect(outKey, outVal);
            }
        }

        public static class Join extends MapReduceBase
                implements Reducer<Text, Text, Text, Text> {

            public void reduce(Text key, Iterator<Text> iter,
                               OutputCollector<Text, Text> oc,
                               Reporter reporter) throws IOException {
                // For each value, figure out which file it's from and
                // store it accordingly.
                List<String> first = new ArrayList<String>();
                List<String> second = new ArrayList<String>();

                while (iter.hasNext()) {
                    Text t = iter.next();
                    String value = t.toString();
                    if (value.charAt(0) == '1') first.add(value.substring(1));
                    else second.add(value.substring(1));
                    reporter.setStatus("OK");
                }

                // Do the cross product and collect the values
                for (String s1 : first) {
                    for (String s2 : second) {
                        String outval = key + "," + s1 + "," + s2;
                        oc.collect(null, new Text(outval));
                        reporter.setStatus("OK");
                    }
                }
            }
        }

  • public static class LoadJoined extends MapReduceBase
            implements Mapper<Text, Text, Text, LongWritable> {

            public void map(Text k, Text val,
                            OutputCollector<Text, LongWritable> oc,
                            Reporter reporter) throws IOException {
                // Find the url
                String line = val.toString();
                int firstComma = line.indexOf(',');
                int secondComma = line.indexOf(',', firstComma);
                String key = line.substring(firstComma, secondComma);
                // drop the rest of the record, I don't need it anymore,
                // just pass a 1 for the combiner/reducer to sum instead.
                Text outKey = new Text(key);
                oc.collect(outKey, new LongWritable(1L));
            }
        }

        public static class ReduceUrls extends MapReduceBase
                implements Reducer<Text, LongWritable, Text, LongWritable> {

            public void reduce(Text key, Iterator<LongWritable> iter,
                               OutputCollector<Text, LongWritable> oc,
                               Reporter reporter) throws IOException {
                // Add up all the values we see
                long sum = 0;
                while (iter.hasNext()) {
                    sum += iter.next().get();
                    reporter.setStatus("OK");
                }
                oc.collect(key, new LongWritable(sum));
            }
        }

        public static class LoadClicks extends MapReduceBase
                implements Mapper<WritableComparable, Writable, LongWritable, Text> {

            public void map(WritableComparable key, Writable val,
                            OutputCollector<LongWritable, Text> oc,
                            Reporter reporter) throws IOException {
                oc.collect((LongWritable) val, (Text) key);
            }
        }

        public static class LimitClicks extends MapReduceBase
                implements Reducer<LongWritable, Text, LongWritable, Text> {

            int count = 0;
            p