/*
This exercise is in the field of bibliometric (words and text) analysis. You will be provided with a text
file, which consists of several paragraphs of English text. Your task is to write a program which will
analyse the text, and output a range of statistics about the text.
Your program should do the following:
• Print a list of all the words that occur, in alphabetical order. You must not print the same word twice.
• Print out the 20 most common words in the text, together with the number of instances of each word,
with the most common word at the top.
*/
//////////////////////////////////////////////////////////////////////////
// ReadTxt.cpp : .
// by Mythma
// Lists the words of the text in alphabetical order, without duplicates,
// and prints the 20 most frequent ones.
// compiler: g++
// This program cannot be compiled with VC6.
//////////////////////////////////////////////////////////////////////////
#include <algorithm>
#include <cctype>
#include <fstream>
// NOTE(review): <fstream.h> is a pre-standard header that current toolchains
// no longer ship; <fstream> above is its standard replacement, so the line
// below can be dropped once no legacy compiler needs to be supported.
#include <fstream.h>
#include <iostream>
#include <map>
#include <string>
#include <vector>
using namespace std;
// Horizontal rule (preceded by a newline) printed around each report section.
const string STR_INTERVAL = "\n-----------------------------------------------------------";
// Word -> number of occurrences. std::map iterates its keys in sorted order,
// which is what produces the alphabetical listing in OutPutWordsList().
map<string, int> gMap;
// NOTE(review): not referenced by any function visible in this file --
// possibly a leftover; confirm before removing.
vector<string> gMMStr;
bool AddWordToList(string strWord)
{
if(strWord.empty())
return false;
string strTemp = strWord;
//upper to lower
for(int i = 0; i < strTemp.size(); i++)
{
strTemp[i] = tolower(strTemp[i]);
}
//remove head punctuation and number
while(strTemp[0] < 'a' || strTemp[0] > 'z')
{
if(strTemp.size() > 1)
strTemp = strTemp.substr(1, strTemp.size() - 1);
else
return false;
}
//remove tail punctuation and number
while(strTemp[strTemp.size() - 1] < 'a' || strTemp[strTemp.size() - 1] > 'z')
{
if(strTemp.size() > 1)
strTemp = strTemp.substr(0, strTemp.size() - 2);
else
return false;
}
map<string ,int>::iterator it = gMap.find(strTemp);
//add to map if exist
if(it == gMap.end())
gMap.insert(map<string, int>::value_type(strTemp, 1));
//increase if not exist
else
++ (*it).second;
return true;
}
/**
 * Prints the number of distinct words followed by every word in gMap.
 *
 * Words appear in the map's key order, five per row, each left-aligned in a
 * 15-character column. The listing is framed by STR_INTERVAL rules.
 */
void OutPutWordsList()
{
    std::cout << STR_INTERVAL << "\n--文件中单词的总数为: " << gMap.size()
              << ", 按字母排列如下" << STR_INTERVAL << std::endl;

    int column = 0;   // words already printed on the current row
    std::map<std::string, int>::iterator cur = gMap.begin();
    while (cur != gMap.end())
    {
        // width() only lasts for the next insertion, so it is set per word.
        std::cout.width(15);
        std::cout.flags(std::ios::left);
        std::cout << cur->first.c_str();

        // Break the row after the fifth column.
        if (++column == 5)
        {
            std::cout << std::endl;
            column = 0;
        }
        ++cur;
    }

    std::cout << STR_INTERVAL << std::endl;
}
/**
 * Ordering predicate used to rank (word, count) pairs.
 *
 * Pairs with a higher count come first. Pairs with the same count are ordered
 * alphabetically by word, so that std::sort (which is not stable) produces the
 * same ranking on every run instead of leaving ties in unspecified order.
 *
 * @param p1  Left-hand (word, count) pair.
 * @param p2  Right-hand (word, count) pair.
 * @return true if p1 must be placed before p2.
 */
bool Cmp(const std::pair<std::string,int> &p1, const std::pair<std::string,int> &p2)
{
    if (p1.second != p2.second)
        return p1.second > p2.second;
    return p1.first < p2.first;
}
void OutPutCount()
{
vector< pair<string,int> > wd(gMap.begin(), gMap.end());
sort(wd.begin(), wd.end(), Cmp);
cout << STR_INTERVAL
<< "\n--出现频率最多的几个单词是:"
<< STR_INTERVAL;
int i = 0;
for(vector< pair<string, int> >::iterator it=wd.begin();
it != wd.end() && i < 20; ++it, ++i)
{
cout.width(15);
cout.flags(ios::left);
cout << endl
<< it->first.c_str()
<< " ---- "
<< it->second;
}
cout << STR_INTERVAL;
}
/**
 * Entry point: counts the words of a text file and prints both reports.
 *
 * @param argc  Argument count. When it is exactly 2, argv[1] is the input
 *              path; otherwise the default "c:\words.txt" is used.
 * @param argv  Argument vector.
 * @return 0 on success, 1 if the input file cannot be opened.
 */
int main(int argc, char* argv[])
{
    // const char*: a string literal must not be bound to a plain char*.
    const char* strPath = (argc == 2) ? argv[1] : "c:\\words.txt";

    std::ifstream inFile(strPath);
    if (!inFile)
    {
        // Without this check a missing file is never detected: the stream is
        // in a failed state but eof() stays false, so a loop that waits for
        // eof() would spin forever.
        std::cerr << "Cannot open input file: " << strPath << std::endl;
        return 1;
    }

    // Testing the extraction itself ends the loop on end-of-file and on any
    // read error, and never processes a token that was not actually read.
    std::string strWord;
    while (inFile >> strWord)
        AddWordToList(strWord);

    OutPutWordsList();
    OutPutCount();
    return 0;
}