Instructions:
1. Write a MapReduce program to find the frequency of each letter, case insensitive, in some given input. For example, the input "The quick brown fox jumps over the lazy dog" should generate the following (letter, count) pairs: (T, 2), (H, 2), (E, 3), etc.
2. Test your program against the 3 attached input files: HadoopFile0.txt, HadoopFile1.txt, and HadoopFile2.txt.
3. The input must be read from HDFS and the output written back to HDFS.
4. Please submit only the Java source file(s) on .
5. I've attached WordCount.java as a sample MapReduce program. You might find it useful.
WordCount.java ::
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: WordCount <input path> <output path>");
            System.exit(-1);
        }

        Job job = Job.getInstance();
        job.setJarByClass(WordCount.class);
        job.setJobName("Word Count");
        // job.setNumReduceTasks(2);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(WordMapper.class);
        job.setReducerClass(WordReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Mapper: lowercases each line, splits on non-word characters,
    // and emits (word, 1) for every non-empty token.
    public static class WordMapper
            extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString().toLowerCase();
            String[] tokens = line.split("\\W+");
            for (String token : tokens) {
                if (token.length() > 0) {
                    context.write(new Text(token), new IntWritable(1));
                }
            }
        }
    }

    // Reducer: sums the 1s emitted for each word.
    public static class WordReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }
}
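A side note on this sample before adapting it: because the reduce step is a plain associative and commutative sum, the same reducer class can also be registered as a combiner, so each mapper pre-aggregates its own (key, 1) pairs locally before the shuffle. A minimal sketch of the one extra driver line, assuming the job is otherwise configured exactly as above:

    job.setCombinerClass(WordReducer.class); // run the reducer map-side to pre-aggregate

With a combiner in place, a mapper that saw "the" 100 times ships a single (the, 100) pair across the network instead of 100 (the, 1) pairs. The letter-count program below uses the same trick.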
HadoopFile0.txt ::
Hadoop is the Elephant King!
A yellow and elegant thing.
He never forgets
Useful data, or lets
An extraneous element cling!
HadoopFile1.txt ::
A wonderful king is Hadoop.
The elephant plays well with Sqoop.
But what helps him to thrive
Are Impala, and Hive,
And HDFS in the group.
HadoopFile2.txt ::
Hadoop is an elegant fellow.
An elephant gentle and mellow.
He never gets mad,
Or does anything bad,
Because, at his core, he is yellow.
LetterCount.java ::
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LetterCount {

    // Map function: lowercases the line so the count is case insensitive,
    // then emits (letter, 1) for every alphabetic character.
    public static class MyMapper
            extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private final Text letter = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString().toLowerCase();
            for (int i = 0; i < line.length(); i++) {
                char c = line.charAt(i);
                if (Character.isLetter(c)) { // skip digits, punctuation, whitespace
                    letter.set(String.valueOf(c));
                    context.write(letter, one);
                }
            }
        }
    }

    // Reduce function: sums the 1s emitted for each letter.
    public static class MyReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Letter Count");
        job.setJarByClass(LetterCount.class);
        job.setMapperClass(MyMapper.class);
        job.setCombinerClass(MyReducer.class); // optional: the sum is associative
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
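To run this against the attached files per instructions 2 and 3, the usual Hadoop workflow is roughly: package the compiled class into a jar, stage the inputs with hadoop fs -mkdir -p letters/input followed by hadoop fs -put HadoopFile0.txt HadoopFile1.txt HadoopFile2.txt letters/input, launch the job with hadoop jar lettercount.jar LetterCount letters/input letters/output, and read the result back out of HDFS with hadoop fs -cat letters/output/part-r-00000. The jar name and the letters/input and letters/output paths are illustrative; note that the output directory must not exist before the run, since FileOutputFormat refuses to overwrite an existing path.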