// Dr. Berna Massingill
// 2000 Jan 29
// CS1321

// CS1321 Sample Document Generator

// Documented by Jeffrey D. Oldham, 2000 Jan 30.  I.e., he is
// responsible for misleading you with the following comments.  :)

// This program generates random documents to test the web search
// engine.
//
// To use the program, type
//   generate wordsfile file-prefix number-of-files min-words max-words
// This will create "number-of-files" files called "file-prefix.00",
// "file-prefix.01", ..., ending with the two-digit form of
// number-of-files - 1 (at most 100 files).  Each file contains
// between "min-words" and "max-words" words randomly chosen from the
// file "wordsfile".
//   wordsfile       = file containing a universe of words from which to draw
//   file-prefix     = filename prefix for all generated files
//   number-of-files = number of generated files to produce
//   min-words       = minimum number of words to place in each generated file
//   max-words       = maximum number of words to place in each generated file

#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <utility>              // has pair
#include <vector>

// A word paired with the number of times it has been written to a
// generated file.
typedef std::pair<std::string, int> StringIntPair;

// ----------------------------------------
// Print error message and exit.
// Writes msg plus a newline to standard error, then terminates the
// program with EXIT_FAILURE; it does not return to the caller.
void errorExit(const std::string & msg)
{
  std::cerr << msg << std::endl;
  std::exit(EXIT_FAILURE);
}

// ----------------------------------------
// Read words from filename into wordData.
// Every whitespace-separated token in the file is appended to
// wordData with a usage count of zero; entries already present in
// wordData are left untouched.
// Exits program if unable to open filename, or if wordData is still
// empty after reading.
void getWords(const char filename[], std::vector<StringIntPair> & wordData)
{
  std::ifstream inStr(filename);
  if (inStr.fail())
    errorExit("Unable to open input file");

  std::string word;
  while (inStr >> word)
    wordData.push_back(std::make_pair(word, 0));
  inStr.close();

  if (wordData.empty())
    errorExit("No words in input file");
}

// Generates 2-character string version of "num", which is assumed
// to be between 0 and 99 inclusive.
string num2str(const int num) { assert(num >= 0 && num < 100); string temp; temp += (static_cast (num/10 + static_cast ('0'))); temp += (static_cast (num%10 + static_cast ('0'))); return temp; } // Returns a random integer in the range [min, max]. int randInRange(const int min, const int max) { return min + static_cast(static_cast(max-min+1) * rand() / (RAND_MAX + 1.0)); } // Generates one output file. // outFileName is filename, outFileWords is number of words total, // wordsPerLine is words per line, wordData is vector of words and // frequencies. void generateOutFile(const char outFileName[], const int outFileWords, const int wordsPerLine, vector & wordData) { ofstream outStr; outStr.open(outFileName); if (outStr.fail()) errorExit("Unable to open output file"); cout << "Generating output file " << outFileName << ", size = " << outFileWords << endl; for (int i = 0; i < outFileWords; ++i) { int wordNum = randInRange(0, wordData.size() - 1); outStr << wordData[wordNum].first; ++wordData[wordNum].second; if (((i % wordsPerLine) == 0) && i > 0) outStr << endl; else outStr << " "; } outStr << endl; outStr.close(); return; } // ---------------------------------------- int main(int argc, char *argv[]) { vector< pair > wordData; string outFileName; const int wordsPerLine = 4; if (argc < 6) errorExit("Arguments: wordsfile outprefix numfiles minwords maxwords"); getWords(argv[1], wordData); string outPrefix = string(argv[2]); int numfiles = strtol(argv[3], static_cast(0), 10); if (numfiles > 100) errorExit("Maximum of 100 input files"); int minwords = strtol(argv[4], static_cast(0), 10); int maxwords = strtol(argv[5], static_cast(0), 10); for (int i = 0; i < numfiles; ++i) { outFileName = outPrefix + "." 
+ num2str(i); generateOutFile(outFileName.c_str(), randInRange(minwords,maxwords), wordsPerLine, wordData); } cout << "\nWord frequencies:\n"; for (unsigned int i = wordData.size(); i > 0; --i) cout << wordData[i-1].first << " used " << wordData[i-1].second << " times\n"; return EXIT_SUCCESS; }