In: Computer Science
c++ "Programming For Big Data [
DvcScheduleV2.cpp and
StaticArray.h and/or
DynamicArray.h ]
Assignment 5's runtime was too slow -- a couple of minutes or so.
It's because of the duplicate-checking, with over 4 billion
compares.
Rewrite the duplicate-checking logic from Assignment 5, using a technique from "Techniques For Big Data, Reading" to do fewer compares, and come up with the exact same results as Assignment 5.
You may use your StaticArray.h from Assignment 3 and/or your DynamicArray.h from assignments 4, but you may not use any STL containers. Submit the H file(s) you use in your solution, even if there are no changes since your previous work. Your project will be compiled for grading using the default stack memory size of 1MB.
Progress Bar
Since this version is supposed to be fast, there is no longer a need for a progress bar. Include one if you wish, or you may leave it out -- your choice. But if you do have a progress bar, do remember to "flush"...
You should get the same result as V1 and should run an order of magnitude faster."
For some reason I'm getting different results from my first version of the programs and the program I've written for this assignment.
Code for the prior assignment :
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
#include <cstring>
#include "DynamicArray.h"
struct Class
{
string code;
int count;
};
int main()
{
DynamicArray <Class> sub;
DynamicArray <string> sem;
DynamicArray <string> sec;
int totalSubjects = 0;
int dup = 0;
int total = 0;
int counter = 0;
bool duplicate;
bool stored;
//for parsing inputfile
char* token;
char buf[1000];
const char* const tab = "\t";
//open input file
ifstream fin;
fin.open("dvc-schedule.txt");
if (!fin.good())
cout << "I/O error. File can't be found!\n";
//read the input file
while (fin.good())
{
//progress bar
if(counter % 1000 == 0)
cout << '.'; cout.flush();
duplicate = false;
stored = false;
string line;
getline(fin, line);
total++; //total lines processed
strcpy(buf, line.c_str());
if (buf[0] == 0) continue; // skip blank lines
//parse the line
const string term(token = strtok(buf, tab));
const string section(token = strtok(0, tab));
const string course((token = strtok(0, tab)) ? token : "");
const string instructor((token = strtok(0, tab)) ? token :
"");
const string whenWhere((token = strtok(0, tab)) ? token :
"");
if (course.find('-') == string::npos) continue;
const string code(course.begin(), course.begin() +
course.find('-'));
//check for duplicates
for(int i = 0; i < counter; i++)
{
if(sem[i] == term && sec[i] == section)
{
dup++;
duplicate = true;
break;
}
}
if(duplicate == true)
continue;
sem[counter] = term;
sec[counter] = section;
counter++;
for(int i = 0; i < totalSubjects; i++)
{
if (sub[i].code == code)
{
sub[i].count++;
stored = true;
break;
}
}
if(stored == true)
continue;
Class y;
y.code = code;
y.count = 1;
sub[totalSubjects] = y;
totalSubjects++;
}
fin.close();
cout << endl;
//sort
for (int i = 0; i < totalSubjects; i++)
for (int j = i + 1; j < totalSubjects; j++)
if (sub[j].code < sub[i].code)
swap(sub[j], sub[i]);
//output
for(int i = 0; i < totalSubjects; i++)
{
cout << sub[i].code << ", " << sub[i].count
<< " section" << endl;
}
cout << "Total duplication: " << dup <<
endl;
cout << "Total counts: " << total << endl;
cout << "Total subjects: " << totalSubjects <<
endl;
}
Code for this assignment:
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <fstream>
#include <string>
using namespace std;
#include <cstring>
#include "DynamicArray.h"
struct SectionsForTerm
{
string term;
int numberOfSectionsSeen;
DynamicArray<string> seenSectionNumbers;
};
struct Class
{
string code;
int count;
};
int main()
{
int numberOfTermsSeen = 0;
DynamicArray <SectionsForTerm> alreadySeen;
DynamicArray <Class> sub;
int totalSubjects = 0;
int dup = 0;
int total = 0;
int counter = 0;
bool match;
bool duplicate;
bool stored;
//for parsing inputfile
char* token;
char buf[1000];
const char* const tab = "\t";
//open input file
ifstream fin;
fin.open("dvc-schedule.txt");
if (!fin.good())
cout << "I/O error. File can't be found!\n";
//read the input file
while (fin.good())
{
//check for false
match = false;
duplicate = false;
stored = false;
//read lines
string line;
getline(fin, line);
total++; //total lines processed
strcpy(buf, line.c_str());
if (buf[0] == 0) continue; // skip blank lines
//parse the line
const string term(token = strtok(buf, tab));
const string section(token = strtok(0, tab));
const string course((token = strtok(0, tab)) ? token : "");
const string instructor((token = strtok(0, tab)) ? token :
"");
const string whenWhere((token = strtok(0, tab)) ? token :
"");
if (course.find('-') == string::npos) continue;
const string code(course.begin(), course.begin() +
course.find('-'));
//check for duplicates
int i;
for(int i = 0; i < numberOfTermsSeen; i++)
{
if(alreadySeen[i].term == term)
{
match = true;
break;
}
}
if(match == true)
{
for(int j = 0; j < alreadySeen[i].numberOfSectionsSeen;
j++)
if (alreadySeen[i].seenSectionNumbers[j]== section)
{
duplicate = true;
dup++;
break;
}
if (duplicate == true)
continue;
else
{
alreadySeen[i].seenSectionNumbers[alreadySeen[i].numberOfSectionsSeen]=
section;
alreadySeen[i].numberOfSectionsSeen++;
}
}
else
{
alreadySeen[numberOfTermsSeen].term = term;
alreadySeen[i].numberOfSectionsSeen = 1;
numberOfTermsSeen++;
}
// check for same section
for(i = 0; i < totalSubjects; i++)
{
if(sub[i].code == code)
{
stored = true;
break;
}
}
if(stored == true)
{
sub[i].count++;
}
else
{
sub[totalSubjects].code = code;
sub[totalSubjects].count = 1;
totalSubjects++;
}
counter++;
}
fin.close();
cout << endl;
//sort
for (int i = 0; i < totalSubjects; i++)
for (int j = i + 1; j < totalSubjects; j++)
if (sub[j].code < sub[i].code)
swap(sub[j], sub[i]);
//output
for(int i = 0; i < totalSubjects; i++)
{
cout << sub[i].code << ", " << sub[i].count
<< " section" << endl;
}
cout << "Total duplication: " << dup <<
endl;
cout << "Total counts: " << total << endl;
cout << "Total subjects: " << totalSubjects <<
endl;
}
*When I compare the results of both programs the subjects and total are the same but the total duplication number is significantly off. Also the amount of sections per course is off? I'm not really sure how, why, or how to fix it?
Before answering your question i believe you are comfortable with dry running.
I am saying this, because in all my industrial experience biggest mistake developers make is they directly jump into the logic part, believe me your question is 90% solved if you can do so witj pen and paper.
Now, i tried both of your code with pen and paper, with my own data, it was small though, but i got an insight about your codes
In your first assignment and secon too, your logic about duplicate terms is completely fine and i appreciate your attemt in deciding the invariant about findind duplicate terms efficiently.
Now, what i believe your codes are getting diff values are due to counter, no of subjects in 1st code
And nooftermsseen in the second, try to matchup with that part, because that is the only way your code can lead to loopholes?
Now some good debugging skills for your future projects too.
Always try to debug your logic first on paper, not your code directly.
Secondly, always use print statements intuitively they are best friends of a developer.
Hope you will learn from this.
Happy learning ??