In: Computer Science
We can implement a basic web search engine by building an inverted index (also known as an index) that maps each term (word) to the set of web pages in which it appears. A search query, which is a sequence of one or more keywords, can be answered by returning the web pages that contain all of the terms in a given query.
Implement a JAVA SearchEngine class that represents an inverted index.
1) SearchEngine()
Constructs an empty SearchEngine instance.
2) void index(String document)
Adds each term in the document to the inverted index. This method will be called for each web page that the client wants to add to the search engine.
3) Set<String> search(String query)
Returns the set of documents where each document contains all of the keywords in the given query ignoring any never-indexed keywords. If none of the keywords have been indexed or if the query is empty, then an empty set is returned.
// SearchEngine.java
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
public class SearchEngine {
// Map that maps each term (word) to the set of web
pages in which it appears
private Map<String, Set<String>>
word_index;
// constructor to create an empty SearchEngine
instance.
public SearchEngine()
{
// create an empty HashMap
word_index = new HashMap<String,
Set<String>>();
}
/*
* helper method to remove all punctuation from the
given word and convert
* the word to lowercase, so that comparison can be
done case-insensitively
*/
private String removePunctuation(String word)
{
word = word.toLowerCase(); //
convert word to lower case
// loop over the word
for(int
i=0;i<word.length();i++)
{
// word is not a
letter, remove the punctuation from the word
if(word.charAt(i) < 'a' || word.charAt(i) > 'z')
word = word.substring(0, i) +
word.substring(i+1);
}
return word; // return modified
word
}
/*
* method that adds each term in the document to the
inverted index.
* This method will be called for each web page that
the client wants to add to the search engine.
*/
public void index(String document)
{
// assuming words are separated by
a space
// get the array of strings from
the document
String[] words = document.split("
");
// loop over the array of
string
for(int
i=0;i<words.length;i++)
{
// remove
punctuation from ith word
String modWord =
removePunctuation(words[i]);
// if word is
found in the index
if(word_index.containsKey(modWord))
{
// get the set of documents from the index
Set<String> docs =
word_index.get(modWord);
docs.add(document); // add this document to the
set
word_index.put(modWord, docs); // update the
entry for modWord
}
else // word is
not found in the index
{
// create a new set of String
Set<String> docs = new
HashSet<String>();
docs.add(document); // add the document to the
set
word_index.put(modWord, docs); // insert word
and the corresponding set to the index
}
}
}
/*
* method that returns the set of documents where each
document contains all of the keywords
* in the given query ignoring any never-indexed
keywords.
* If none of the keywords have been indexed or if the
query is empty, then an empty set is returned.
*/
public Set<String> search(String query)
{
// create the resultant set of
string and set it to null
Set<String> documents =
null;
// assuming term in query is
separated by a space
// get the array of strings from
the input query
String[] keywords = query.split("
");
// loop over the terms
for(int
i=0;i<keywords.length;i++)
{
// get the word
after removing punctuation and converting it to lowercase
String word =
removePunctuation(keywords[i]);
// word is found
in index
if(word_index.containsKey(word))
{
// get the set of documents containing the
word
Set<String> docs =
word_index.get(word);
// documents is not initialized, set documents
to docs
if(documents == null)
documents = docs;
else // documents is initialized, set documents
to intersection of documents and docs
documents.retainAll(docs);
}
}
// if documents was not initialized
i.e either query was empty or no terms was found in search
engine
// set documents to empty hash
set
if(documents == null)
documents = new
HashSet<String>();
return documents;
}
}
//end of SearchEngine.java
// SearchEngineTester.java
import java.util.Set;
public class SearchEngineTester {
public static void main(String[] args)
{
SearchEngine engine = new
SearchEngine();
engine.index("Good Morning");
engine.index("Good
Afternoon");
Set<String> documents =
engine.search("morning day");
System.out.println(documents.toString());
documents = engine.search("good
day");
System.out.println(documents.toString());
engine.index("every day");
documents = engine.search("morning
day");
System.out.println(documents.toString());
}
}
//end of SearchEngineTester.java
Output: