When to use MapReduce

Apache Hadoop

Hadoop installation



Use PHP as a word counter

wget http://www.gutenberg.org/cache/epub/2701/pg2701.txt
hadoop dfs -mkdir wordcount
#!/usr/bin/php
<?php
/**
 * Word-count mapper for Hadoop Streaming (mapper.php).
 *
 * Reads text from standard input line by line and writes one record per
 * whitespace-separated word to standard output in the form
 * "<word>\t1". These records are the input of the reduce step
 * (reducer.php).
 */

// Compare with false explicitly: a final line that is just "0" without a
// trailing newline is falsy and would otherwise end the loop early.
while (($line = fgets(STDIN)) !== false) {
    // remove leading and trailing whitespace
    $line = trim($line);

    // split the line into words, discarding empty fragments
    $words = preg_split('/\s/', $line, -1, PREG_SPLIT_NO_EMPTY);

    foreach ($words as $word) {
        // word (key), tab, word count (always 1)
        printf("%s\t%d\n", $word, 1);
    }
}
?>

#!/usr/bin/php
<?php
/**
 * Word-count reducer for Hadoop Streaming (reducer.php).
 *
 * Reads "<key>\t<count>" records from standard input and writes one
 * "<key>\t<total>" record per distinct key to standard output.
 *
 * The single-pass logic relies on the input being grouped by key:
 * Hadoop sorts the mapper output by its keys before sending it to the
 * reducer (use `sort` when testing locally in a shell pipeline).
 */

$last_key = NULL;
$running_total = 0;

// Compare with false explicitly so a falsy line cannot end the loop early.
while (($line = fgets(STDIN)) !== false) {
    // remove leading and trailing whitespace
    $line = trim($line);

    // split the line into key and count; skip lines without a tab
    $parts = explode("\t", $line, 2);
    if (count($parts) < 2) {
        continue;
    }
    list($key, $count) = $parts;

    if ($last_key === $key) {
        // same key as the previous record: keep accumulating
        $running_total += (int) $count;
    } else {
        // key changed: emit the finished key, then start the new one
        if ($last_key !== NULL) {
            printf("%s\t%d\n", $last_key, $running_total);
        }
        $last_key = $key;
        $running_total = (int) $count;
    }
}

// Flush the last key; without this the final word is never reported.
if ($last_key !== NULL) {
    printf("%s\t%d\n", $last_key, $running_total);
}
?>

head -n1000 pg2701.txt | ./mapper.php | sort | ./reducer.php
hadoop jar /usr/hadoop/2.5.1/libexec/lib/hadoop-streaming-2.5.1.jar \
    -mapper "./mapper.php" -reducer "./reducer.php" -input "hello/mobydick.txt" -output "hello/result"

hdfs dfs -cat hello/result/part-00000

Calculate the average annual gold price

wget https://raw.githubusercontent. ... a.csv

hadoop dfs -mkdir goldprice

hadoop dfs -copyFromLocal ./data.csv goldprice/data.csv
#!/usr/bin/php
<?php
/**
 * Gold-price mapper for Hadoop Streaming (mapper.php).
 *
 * Reads CSV lines from standard input and, for every line that matches
 * "<year>-<rest>,<value>", writes "<year>\t<value>" to standard output
 * with the value formatted to three decimals. Lines that do not match
 * (for example a header row without a hyphen) are skipped.
 */

// Compare with false explicitly so a falsy line cannot end the loop early.
while (($line = fgets(STDIN)) !== false) {
    // remove leading and trailing whitespace
    $line = trim($line);

    // capture the text before the first hyphen (year) and the text after
    // the last comma (gold value)
    if (preg_match('/^(.*?)-(?:.*),(.*)$/', $line, $matches) === 1) {
        // key: year, value: gold price
        printf("%s\t%.3f\n", $matches[1], $matches[2]);
    }
}
?>

#!/usr/bin/php
<?php
/**
 * Gold-price reducer for Hadoop Streaming (reducer.php).
 *
 * Reads "<key>\t<value>" records from standard input and writes one
 * "<key>\t<average>" record per distinct key to standard output, where
 * the average is the arithmetic mean of that key's values, formatted to
 * three decimals.
 *
 * The input must be grouped by key (sorted), as the single pass only
 * compares each record's key with the previous one.
 */

$last_key = NULL;
$running_total = 0;
$number_of_items = 0;

// Compare with false explicitly so a falsy line cannot end the loop early.
while (($line = fgets(STDIN)) !== false) {
    // remove leading and trailing whitespace
    $line = trim($line);

    // split the line into key and value; skip lines without a tab
    $parts = explode("\t", $line, 2);
    if (count($parts) < 2) {
        continue;
    }
    list($key, $value) = $parts;

    if ($last_key === $key) {
        // same key as the previous record: keep accumulating
        $number_of_items++;
        $running_total += (float) $value;
    } else {
        // key changed: emit the finished key's average, then reset
        if ($last_key !== NULL) {
            printf("%s\t%.3f\n", $last_key, $running_total / $number_of_items);
        }
        $last_key = $key;
        $number_of_items = 1;
        $running_total = (float) $value;
    }
}

// Flush the last key. $number_of_items is at least 1 whenever a key was
// seen, so the division is safe.
if ($last_key !== NULL) {
    printf("%s\t%.3f\n", $last_key, $running_total / $number_of_items);
}
?>

head -n1000 data.csv | ./mapper.php | sort | ./reducer.php
hadoop jar /usr/hadoop/2.5.1/libexec/lib/hadoop-streaming-2.5.1.jar \
    -mapper "./mapper.php" -reducer "./reducer.php" -input "goldprice/data.csv" -output "goldprice/result"
hdfs dfs -cat goldprice/result/part-00000
Bonus: Generate charts

hdfs dfs -get goldprice/result/part-00000 gold.dat
# Gnuplot script file for generating gold prices
#
# Reads the data file "gold.dat" and draws it as a line chart with
# titled axes and a grid, writing the image to "chart.jpg".

# The terminal type must match the output file's extension; a png
# terminal would write PNG data into a file named .jpg.
set terminal jpeg
set output "chart.jpg"
set style data lines
# "unset key" is the current spelling of the deprecated "set nokey"
unset key
set grid
set title "Gold prices"
set xlabel "Year"
set ylabel "Price"
plot "gold.dat"
gnuplot gold.plot
