In: Computer Science
Write a method that returns the 200tth most frequent word and its frequency from a text file. Use the best text reading method with a merge sort. and/or other techniques. The method signature:
public static Integer countFAST(String fileName) throws Exception {...}
The function takes the text file name.
Source Code-
#include <stdio.h>
#include <string.h>
#include <ctype.h>
# define MAX_CHARS 26
# define MAX_WORD_SIZE 30
struct TrieNode
{
bool isEnd;
unsigned frequency;
int indexMinHeap;
TrieNode* child[MAX_CHARS];
};
struct MinHeapNode
{
TrieNode* root;
unsigned frequency;
char* word;
};
struct MinHeap
{
unsigned capacity;
int count;
MinHeapNode* array;
};
TrieNode* newTrieNode()
{
TrieNode* trieNode = new TrieNode;
trieNode->isEnd = 0;
trieNode->frequency = 0;
trieNode->indexMinHeap = -1;
for( int i = 0; i < MAX_CHARS; ++i )
trieNode->child[i] = NULL;
return trieNode;
}
MinHeap* createMinHeap( int capacity )
{
MinHeap* minHeap = new MinHeap;
minHeap->capacity = capacity;
minHeap->count = 0;
minHeap->array = new MinHeapNode [ minHeap->capacity ];
return minHeap;
}
void swapMinHeapNodes ( MinHeapNode* a, MinHeapNode* b )
{
MinHeapNode temp = *a;
*a = *b;
*b = temp;
}
void minHeapify( MinHeap* minHeap, int idx )
{
int left, right, smallest;
left = 2 * idx + 1;
right = 2 * idx + 2;
smallest = idx;
if ( left < minHeap->count &&
minHeap->array[ left ]. frequency <
minHeap->array[ smallest ]. frequency
)
smallest = left;
if ( right < minHeap->count &&
minHeap->array[ right ]. frequency <
minHeap->array[ smallest ]. frequency
)
smallest = right;
if( smallest != idx )
{
minHeap->array[ smallest ]. root->indexMinHeap = idx;
minHeap->array[ idx ]. root->indexMinHeap = smallest;
swapMinHeapNodes (&minHeap->array[ smallest ], &minHeap->array[ idx ]);
minHeapify( minHeap, smallest );
}
}
void buildMinHeap( MinHeap* minHeap )
{
int n, i;
n = minHeap->count - 1;
for( i = ( n - 1 ) / 2; i >= 0; --i )
minHeapify( minHeap, i );
}
void insertInMinHeap( MinHeap* minHeap, TrieNode** root, const char* word )
{
if( (*root)->indexMinHeap != -1 )
{
++( minHeap->array[ (*root)->indexMinHeap ]. frequency );
minHeapify( minHeap, (*root)->indexMinHeap );
}
else if( minHeap->count < minHeap->capacity )
{
int count = minHeap->count;
minHeap->array[ count ]. frequency = (*root)->frequency;
minHeap->array[ count ]. word = new char [strlen( word ) + 1];
strcpy( minHeap->array[ count ]. word, word );
minHeap->array[ count ]. root = *root;
(*root)->indexMinHeap = minHeap->count;
++( minHeap->count );
buildMinHeap( minHeap );
}
else if ( (*root)->frequency > minHeap->array[0]. frequency )
{
minHeap->array[ 0 ]. root->indexMinHeap = -1;
minHeap->array[ 0 ]. root = *root;
minHeap->array[ 0 ]. root->indexMinHeap = 0;
minHeap->array[ 0 ]. frequency = (*root)->frequency;
delete [] minHeap->array[ 0 ]. word;
minHeap->array[ 0 ]. word = new char [strlen( word ) + 1];
strcpy( minHeap->array[ 0 ]. word, word );
minHeapify ( minHeap, 0 );
}
}
void insertUtil ( TrieNode** root, MinHeap* minHeap,
const char* word, const char* dupWord )
{
if ( *root == NULL )
*root = newTrieNode();
if ( *word != '\0' )
insertUtil ( &((*root)->child[ tolower( *word ) - 97 ]),
minHeap, word + 1, dupWord );
else
{
if ( (*root)->isEnd )
++( (*root)->frequency );
else
{
(*root)->isEnd = 1;
(*root)->frequency = 1;
}
insertInMinHeap( minHeap, root, dupWord );
}
}
void insertTrieAndHeap(const char *word, TrieNode** root, MinHeap* minHeap)
{
insertUtil( root, minHeap, word, word );
}
void displayMinHeap( MinHeap* minHeap )
{
int i;
for( i = 0; i < minHeap->count; ++i )
{
printf( "%s : %d\n", minHeap->array[i].word,
minHeap->array[i].frequency );
}
}
void printKMostFreq( FILE* fp, int k )
{
MinHeap* minHeap = createMinHeap( k );
TrieNode* root = NULL;
char buffer[MAX_WORD_SIZE];
while( fscanf( fp, "%s", buffer ) != EOF )
insertTrieAndHeap(buffer, &root, minHeap);
displayMinHeap( minHeap );
}
int main()
{
int k = 200; //helps in finding 200th most frequent word in a text file
FILE *fp = fopen ("file.txt", "r");
if (fp == NULL)
printf ("File doesn't exist ");
else
printKMostFreq (fp, k);
return 0;
}
Let me know if you have any doubts or if you need anything to change. If you are satisfied with the solution, please leave a +ve feedback : ) Let me know for any help with any other questions. Thank You! ===========================================================================