In: Computer Science
Our next optimization comes from using the brilliant Rabin-Karp substring matching algorithm (RK for short), invented in the eighties by two famous computer scientists, Michael Rabin and Richard Karp 3. RK checks if a given query string P appears as a substring in Y. At a high level, RK works by computing a hash for the query string, hash(P ), as well as a hash for each of the n − k + 1substrings of length k in Y, hash(Y [0...k − 1]), hash(Y [1...k]), ..., hash(Y [n − k...n − 1]). A hash function turns any arbitrary string into a b-bit hash value with the property that collision (two different strings with the same hash value) is unlikely. Therefore, by comparing hash (P) with each of the n − k + 1 hashes from Y, we can check if P appears as a substring in Y. There are many nice hash functions out there (such as MD5, SHA-1), but RK's magical ingredient is its "rolling" hash function. Specifically, given hash(Y [i...i + k − 1]), it takes only constant time instead of O(k) time to compute hash(Y [i + 1...i + k]). Now we explain how the rolling hash is computed. Let's treat each character as a digit in radix-d notation. We choose radix d = 256 since each character in the C language is represented by a single byte and we can conveniently use the byte value of the character as its digit. For example, the string 'ab' corresponds to two digits with one being 97 (the ASCII value of 'a'), and the other being 98 (the ASCII value of 'b'). The decimal value of 'ab' in radix-256 can be calculated as 256 ∗ 97 + 98 = 24930. The hash of a string P in RK is hash(P[0...k−1])=256?−1 ∗P[0]+256?−2∗P[1]+...+256∗P[k−2]+P[k−1]].
COMPLETE THE RABIN_KARP MATCH FUNCTION ONLY
Now let's see how to do a rolling calculation of the values for every substring of Y.
Let ?? = hash(Y [0...k − 1] and yi = hash(Y [i...i + k − 1]). We can compute ??+1 from ?? in constant time, by observing that
??+1 =256∗(?? −256?−1∗Y[i])+Y[i+k]
Note that we have to remember the value of 256?−1 in a variable instead of re-computing it for?? each time. Now we've seen how rolling hash works. The only fly in the ointment is that these radix-256 hash values are too huge to work with efficiently and conveniently. Therefore, we perform all the computation in modulo q, where q is chosen to be a ????? 4 ????? 5. Hash collisions are infrequent, but still possible. Therefore once we detect some ?? = hash(P ), we should compare the actual strings Y [i...i + k − 1] and P [0...k − 1] to see if they are indeed identical. Since RK speeds up substring matching to O(n) on average instead of O(n ∗ k) as in the simple algorithm. However, we still need to run RK [?? ] times for each of the [?? ] chunks of X to be matched in Y. Thus, our approximate matching algorithm using RK has an overall runtimeO ( ?? * n )
Your job: Implement the RK substring matching algorithm by completing therabin_karp_match for each chunk of X to bet matched. When calculating the hash values, you should use the given modulo arithmetic functions madd, mdel, mul
As with simple_match, our main procedure will invoke rabin_karp_match for each chunk of X to be matched. rabin_karp_match has the same interface as simple_match and should return 1 if the chunk appears as a substring in Y or 0 if otherwise.
Complete the following function:
* Match every k-character snippet of the query_doc
document
among a collection of documents doc1, doc2, ....
./rkmatch snippet_size query_doc doc1 [doc2...]
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <strings.h>
#include <assert.h>
#include <time.h>
#include <ctype.h>
#include "bloom.h"
enum algotype { SIMPLE = 0, RK, RKBATCH};
/* a large prime for RK hash (BIG_PRIME*256 does not
overflow)*/
long long BIG_PRIME = 5003943032159437;
/* constants used for printing debug information */
const int PRINT_RK_HASH = 5;
const int PRINT_BLOOM_BITS = 160;
/* modulo addition */
long long
madd(long long a, long long b)
{
return
((a+b)>BIG_PRIME?(a+b-BIG_PRIME):(a+b));
}
/* modulo substraction */
long long
mdel(long long a, long long b)
{
return ((a>b)?(a-b):(a+BIG_PRIME-b));
}
/* modulo multiplication*/
long long
mmul(long long a, long long b)
{
return ((a*b) % BIG_PRIME);
}
/* read the entire content of the file 'fname' into a
character array allocated by this procedure.
Upon return, *doc contains the address of the
character array
*doc_len contains the length of the array
*/
void
read_file(const char *fname, char **doc, int *doc_len)
{
struct stat st;
int fd;
int n = 0;
fd = open(fname, O_RDONLY);
if (fd < 0) {
perror("read_file: open ");
exit(1);
}
if (fstat(fd, &st) != 0) {
perror("read_file: fstat ");
exit(1);
}
*doc = (char *)malloc(st.st_size);
if (!(*doc)) {
fprintf(stderr, " failed to
allocate %d bytes. No memory\n", (int)st.st_size);
exit(1);
}
n = read(fd, *doc, st.st_size);
if (n < 0) {
perror("read_file: read ");
exit(1);
}else if (n != st.st_size) {
fprintf(stderr,"read_file: short
read!\n");
exit(1);
}
close(fd);
*doc_len = n;
}
/* The normalize procedure examines a character array of size
len
in ONE PASS and does the following:
1) turn all upper case letters into lower case
ones
2) turn any white-space character into a space
character and,
shrink any n>1 consecutive spaces into exactly 1
space only
Hint: use C
library function isspace()
You must do the normalization IN PLACE so that when
the procedure
returns, the character array buf contains the
normalized string and
the return value is the length of the normalized
string.
*/
int
normalize(char *buf, /* The character array containing
the string to be normalized*/
int len
/* the size of the original
character array */)
{
int start_index=0; //starting index
int new_index=0; //index returning the new
position
//for loop iterating over the characters in the
array
for
(start_index=0;start_index<len;start_index++)
{
//checks if the character is an
uppercase
if(isupper(buf[start_index]))
{
buf[new_index++]=tolower(buf[start_index]); //coverts the upper
case character to a lower case and the new index changes
position
}
else if
(!isupper(buf[start_index])&& !isspace(buf[start_index]))
//handles if its not an upper case and not a space
{
buf[new_index++]=buf[start_index];
}
//removes the consecutive
spaces
else
if(isspace(buf[start_index])){
if
((start_index>0) && (start_index < (len-1))
&& !isspace(buf[start_index-1])){
buf[new_index++]=' ';
}
}
}
//removes the trailing spaces at the end
if (isspace(buf[len-1])) //if the last character is a string
{
return new_index-1; //returns the character before the
last that is not a space
}
else{
return new_index;
}
}
/* check if a query string ps (of length k) appears
in ts (of length n) as a substring
If so, return 1. Else return 0
You may want to use the library function strncmp
*/
int
simple_match(const char *ps, /* the query string
*/
int k,
/* the length of the query
string */
const char
*ts, /* the document string (Y) */
int
n
/* the
length of the document Y */)
{
int i;
for (i=0;i<n;i++){ //for loop iterating
if (strncmp(&ts[i],ps,k)==0)
//library function that compares if the query string appears in the
other document
{
return 1; //if
it matches it will return 1
}
}
return 0; //if it doesnt, it returns 0
}
/* Check if a query string ps (of length k) appears
in ts (of length n) as a substring using the
rabin-karp algorithm
If so, return 1. Else return 0
In addition, print the first 'PRINT_RK_HASH' hash
values of ts
Example:
$ ./rkmatch -t 1 -k 20 X Y
605818861882592 812687061542252 1113263531943837
1168659952685767 4992125708617222
0.01 matched: 1 out of 148
*/
COMPLETE THIS FUNCTION ONLY
int
rabin_karp_match(const char *ps, /* the query string
*/
int k,
/* the length of the query string */
const char *ts,
/* the document string (Y) */
int n
/* the length of the document
Y */ )
{
return 0;
}
/* Initialize the bitmap for the bloom filter using
bloom_init().
Insert all m/k RK hashes of qs into the bloom filter
using bloom_add().
Then, compute each of the n-k+1 RK hashes of ts and
check if it's in the filter using bloom_query().
Use the given procedure, hash_i(i, p), to compute the
i-th bloom filter hash value for the RK value p.
Return the number of matched chunks.
Additionally, print out the first PRINT_BLOOM_BITS of
the bloom filter using the given bloom_print
after inserting m/k substrings from qs.
*/
int
rabin_karp_batchmatch(int bsz, /* size of bitmap (in bits) to be
used */
int k, /* chunk length to be matched */
const char *qs, /* query docoument (X)*/
int m, /* query document length */
const char *ts, /* to-be-matched document (Y) */
int n /* to-be-matched document length*/)
{
return 0;
}
/* Following program is a C++ implementation of Rabin Karp
Algorithm given in the CLRS book */
#include <bits/stdc++.h>
using namespace std;
// d is the number of characters in the input alphabet
#define d 256
/* pat -> pattern
txt -> text
q -> A prime number
*/
void search(char pat[], char txt[], int q)
{
int M = strlen(pat);
int N = strlen(txt);
int i, j;
int p = 0; // hash value for pattern
int t = 0; // hash value for txt
int h = 1;
// The value of h would be "pow(d, M-1)%q"
for (i = 0; i < M - 1; i++)
h = (h * d) % q;
// Calculate the hash value of pattern and
first
// window of text
for (i = 0; i < M; i++)
{
p = (d * p + pat[i]) % q;
t = (d * t + txt[i]) % q;
}
// Slide the pattern over text one by one
for (i = 0; i <= N - M; i++)
{
// Check the hash values of
current window of text
// and pattern. If the hash values
match then only
// check for characters on by
one
if ( p == t )
{
/* Check for
characters one by one */
for (j = 0; j
< M; j++)
{
if (txt[i+j] != pat[j])
break;
}
// if p == t
and pat[0...M-1] = txt[i, i+1, ...i+M-1]
if (j ==
M)
cout<<"Pattern found at index "<<
i<<endl;
}
// Calculate hash value for next
window of text: Remove
// leading digit, add trailing
digit
if ( i < N-M )
{
t = (d*(t -
txt[i]*h) + txt[i+M])%q;
// We might
get negative value of t, converting it
// to
positive
if (t <
0)
t = (t +
q);
}
}
}
/* Driver code */
int main()
{
char txt[] = "GEEKS FOR GEEKS";
char pat[] = "GEEK";
int q = 101; // A prime number
search(pat, txt, q);
return 0;
}
// This is code is contributed by rathbhupendra