// Oldham, Jeffrey D.
// 2000 Jan 25
// CS1321

// CS1321 Homework 3: Creating a Database from Documents

// This program takes a file containing a list of document names.
// The documents are processed, producing output containing:
//   1. the keywords on which files can be searched
//   2. for each document, its name and its keyword vector

// Details:
// In the first phase, all the words in all the documents are
// collected.  Unimportant words, e.g., very frequent and very
// infrequent ones, are discarded.  (Actually, we just discard words
// with three or fewer characters.)
// In the second phase, we build a vector for each document, scale it
// to a unit vector, and print it to cout.

// Command-line Arguments:
//   1. the name of the file listing the document names
//   2. (optionally) a prefix to affix to each document name

// Declarations

#include <iostream>
#include <cstdlib>    // has EXIT_SUCCESS
#include <fstream>
#include <cctype>     // has tolower() and isalpha()
#include <string>
#include <vector>
#include <algorithm>  // has for_each() and copy()
#include <iterator>   // has ostream_iterator
#include <numeric>    // has inner_product()
#include <cmath>      // has sqrt()
#include "types.h"    // course-supplied header (presumably declares hash_map)
#include <cassert>    // has assert()

using namespace std;

// We need a mapping from words to frequency counts.
typedef unsigned long ulong;
typedef hash_map<string, ulong> WordMapping;

// Scale a vector and print its contents.
// precondition:  v has at least one nonzero component (so its length is nonzero)
// postcondition: components of the vector, scaled to unit length, are printed
void printUnitVector(const vector<double> & v)
{
  const double length =
    sqrt(inner_product(v.begin(), v.end(), v.begin(), 0.0));
  vector<double>::const_iterator pos;
  for (pos = v.begin(); pos != v.end(); ++pos)
    cout << (*pos) / length << " ";
  cout << endl;
}

// Debugging Code

// Print a vector's contents.
template <class T>
void printVector(const vector<T> & v)
{
  copy(v.begin(), v.end(), ostream_iterator<T>(cout, " "));
}

// Print the word frequency table's contents.
class printHashPair {  // a helper class;
                       // !Hairy Code!  Read at your own risk!
public:
  printHashPair(ostream & o) : out(o) {}
  void operator()(const pair<string, ulong> & p)
  {
    out << p.first << " found " << p.second << " times\n";
    // return out;
  }
private:
  ostream & out;
};

void printWFT(const WordMapping & wm, ostream & out)
{
  for_each(wm.begin(), wm.end(), printHashPair(out));
  return;
}

// A heuristic attempt to remove punctuation and ignore case.
// precondition:  none
// postcondition: The returned string has only lowercase letters and
//                only characters appearing before the first
//                non-letter in the argument.
string canonicalizeWord(const string & word)
{
  string result;
  for (string::const_iterator pos = word.begin();
       pos != word.end() && isalpha(*pos);
       ++pos)
    result += tolower(*pos);
  return result;
}

int main(int argc, char *argv[])
{
  string documentName;    // name of the current document
  string documentPrefix;  // prefix to affix to all document names

  if (argc < 2 || argc > 3) {
    cerr << argv[0] << ": file-of-documents [document-prefix]\n";
    return EXIT_FAILURE;
  }
  else if (argc == 3)
    documentPrefix = argv[2];

  // Collect all the document words.
  WordMapping wordFrequencies;
  ifstream if_filename;
  ifstream if_document;
  string documentWord, canonicalWord;

  if_filename.open(argv[1]);
  if (!if_filename) {
    cerr << argv[0] << ": " << argv[1] << " cannot be read.\n";
    return EXIT_FAILURE;
  }
  while (if_filename >> documentName) {
    if_document.open(documentName.c_str());
    if (!if_document) {
      cerr << argv[0] << ": " << documentName << " cannot be read.\n";
      return EXIT_FAILURE;
    }
    // Add all the strings in the document to the word frequency list.
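    // Illustrative examples (assumed inputs, derived from canonicalizeWord()
    // above; they are not produced or checked by this program):
    //   "Database,"  -> "database"  (kept: longer than three letters)
    //   "C++"        -> "c"         (discarded: too short)
    //   "2000Feb08"  -> ""          (discarded: leading non-letter)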
    while (if_document >> documentWord) {
      canonicalWord = canonicalizeWord(documentWord);
      if (canonicalWord.size() > 3)  // omit very short words
        ++wordFrequencies[canonicalWord];
    }
    if_document.close();
    if_document.clear();  // reset the stream state so it can be reused
  }

#ifdef DEBUG
  printWFT(wordFrequencies, cout);
#endif // DEBUG

  if_filename.close();
  if_filename.clear();  // reset the stream state before reopening below

  // Number all the important words and print all the keywords.
  ulong componentNumber;
  WordMapping::iterator pos;
  for (pos = wordFrequencies.begin(), componentNumber = 0;
       pos != wordFrequencies.end();
       ++componentNumber, ++pos) {
    cout << (*pos).first << " ";
    (*pos).second = componentNumber;
  }
  cout << endl;

  // Construct the document vector (DocVec) for each document.
  if_filename.open(argv[1]);
  if (!if_filename) {
    cerr << argv[0] << ": " << argv[1] << " cannot be read.\n";
    return EXIT_FAILURE;
  }
  while (if_filename >> documentName) {
    if_document.open(documentName.c_str());
    if (!if_document) {
      cerr << argv[0] << ": " << documentName << " cannot be read.\n";
      return EXIT_FAILURE;
    }
    // Count each keyword's occurrences in the document.
    vector<double> freq(static_cast<vector<double>::size_type>(componentNumber));
    while (if_document >> documentWord) {
      // 2000Feb08 JDO: Added code to canonicalize words.  I guess I
      // forgot to add the canonicalization code here when I added it
      // up above.
      canonicalWord = canonicalizeWord(documentWord);
      if (canonicalWord.size() > 3) {  // omit very short words
        assert(wordFrequencies.find(canonicalWord) != wordFrequencies.end());
        ++freq[wordFrequencies[canonicalWord]];
      }
    }
    // Print the vector.
    cout << documentPrefix << documentName << " ";
    printUnitVector(freq);
    if_document.close();
    if_document.clear();  // reset the stream state so it can be reused
  }
  if_filename.close();

  return EXIT_SUCCESS;
}
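
// ---------------------------------------------------------------------------
// Usage sketch.  The compiler command, file names, and prefix below are
// assumptions for illustration, not part of the assignment; the program also
// assumes types.h (or the compiler's library) supplies hash_map.
//
//   g++ -o makedb makedb.cc
//   printf 'alpha.txt\nbeta.txt\n' > doclist.txt
//   ./makedb doclist.txt texts/ > database.txt
//
// The documents named in doclist.txt are opened relative to the current
// directory; the optional prefix ("texts/") is only prepended to the names
// printed in the output.  The first output line lists the keywords; each
// following line gives a document's (prefixed) name and its unit keyword
// vector, one component per keyword.
// ---------------------------------------------------------------------------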