In: Computer Science
Write a MapReduce program in Hadoop to find the words that occur more than 2000 times in the book.txt file. Before counting the occurrences, convert all words to lower case, remove the digits (0-9) and punctuation, and filter out the Filterwords. The class used to filter the words is given below. Filterwords.isOneOfThem(String in) returns true if in is a Filterword.
/**
 * Lookup table of the filter words that are to be left out of the word count.
 */
class Filterwords {
    /** The filter words, all written in lower case. */
    public static String[] myFilterWordsArray = { "a", "an", "the", "am", "are", "is", "at" };

    /**
     * The contents of {@link #myFilterWordsArray} copied into a hash set when the class is
     * initialized. Later changes to the array are not reflected here.
     */
    public static Set<String> myFilterWords =
            new HashSet<String>(Arrays.asList(myFilterWordsArray));

    /**
     * Tells whether the given word is a filter word.
     *
     * @param in the word to look up; it is matched exactly, so callers should lower-case
     *           it first
     * @return {@code true} if {@code in} is contained in {@link #myFilterWords}
     */
    public static boolean isOneOfThem(String in) {
        return myFilterWords.contains(in);
    }
}
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static class WordMapper extends Mapper<LongWritable,Text,
Text, IntWritable>{
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context
context)
throws IOException, InterruptedException {
String[] stringArr = value.toString().split("\\s+");
for (String str : stringArr) {
word.set(str);
context.write(word, one);
}
}
}
public static class CountReducer extends Reducer<Text,
IntWritable, Text, IntWritable>{
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable values, Context
context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(WordMapper.class);
job.setReducerClass(CountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}