1 // gavin hurley
    2 // 2009-01-15 Thursday
    3 // This program takes input from standard input and outputs a listing of
    4 // the character counts in decreasing order. Useful for cracking incredibly
    5 // simple crytosystems (monoalphabetic substitution).
    6 //
    7 // There are two parts to the program. The first simply reads from stdin and
    8 // makes a map of character to its count. The second part operates on this
    9 // map and builds 1)) a vector<int> listing just the counts and 2) a multimap
   10 // that's the inverse of the original map; counts to chars. After sorting
   11 // the vector it uses the values to look up items in the multimap. This is the
   12 // simplest way I could see to get a histogram type map. Another approach
   13 // would be to create a vector of pair<int,char> and write a function to sort
   14 // based on the int.
   15 //
   16 // TODO: decompose into functions
   17 // TODO: factor out the code into classes and/or templates so that I can
   18 //       make other types of histograms (e.g. word counts).
   19 // TODO: add command line flags to conflate case (i.e. everything converted
   20 //       to upper case)
   21 // TODO: add command line flag to ignore punctuation and whitespace
   22 
   23 #include <algorithm>
   24 #include <cctype>
   25 #include <fstream>
   26 #include <iomanip>
   27 #include <iostream>
   28 #include <map>
   29 #include <string>
   30 #include <vector>
   31 
   32 using std::endl;
   33 using std::cout;
   34 
   35 int main (int argc, char const *argv[]) {
   36   typedef std::map<char, int> CharMap;
   37   typedef CharMap::iterator CharMapIter;
   38   
   39   bool skip_punct = true;
   40   bool skip_ws = true;
   41   bool conflate_case = true;
   42   
   43   // Build main map of chars to count
   44   int total_chars = 0;
   45   CharMap cc;
   46   std::string s;
   47   while(getline(std::cin, s)) {
   48     for(size_t i = 0; i < s.length(); ++i) {
   49       char c = s[i];
   50       if (conflate_case) {
   51         c = std::toupper(c);
   52       }
   53       if ((skip_punct && std::ispunct(c))
   54           || (skip_ws && std::isspace(c))) {
   55         continue;
   56       }
   57       cc[c]++;
   58       total_chars++;
   59     }
   60   }
   61 
   62   // Use the map to make the (inverse) map and the count vector
   63   typedef std::multimap<int, char> MultMap;
   64   typedef MultMap::const_iterator MultMapIter;
   65   MultMap mm;
   66   std::vector<int> counts(cc.size());
   67   for(CharMapIter it = cc.begin(); it != cc.end(); it++) {
   68     mm.insert(std::pair<int,char>(it->second, it->first));
   69     counts.push_back(it->second);
   70   }
   71 
   72   std::sort(counts.begin(), counts.end());
   73   // Iterate the vector (backwards) and pull out the items that have that
   74   // count.
   75   typedef std::vector<int>::reverse_iterator RevIter;
   76   int previous = -1;
   77   for(RevIter it = counts.rbegin(); it != counts.rend(); ++it) {
   78     if(*it == previous) {
   79       continue;
   80     }
   81     previous = *it;
   82     for(MultMapIter iter = mm.lower_bound(*it);
   83         iter != mm.upper_bound(*it); ++iter) {
   84       float percent = float(iter->first)/float(total_chars) * 100;
   85       cout << " char: " << iter->second << " " << std::setw(4)
   86            << iter->first << " [" << std::fixed << std::setprecision(2)
   87            << percent << "%]" << endl;
   88     }
   89   }
   90   return 0;
   91 }