赞
踩
Write a program for displaying the ten most frequent words in a file such that your program should be efficient in all complexity measures.
Requirement: C/C++.
Submission: Source code and a document to describe your solution.
Source:
思路:用hashmap 进行每个word进行记录,然后就可以在利用堆排序中找到10个最频率最高的。
1:记录每个单词的个数;
一种做法:
其实C++有提供hashmap来进行记录key_value进行存放。用法像:map
二种做法:
自己定义hash函数进行key_value的存放:
2:
利用记录过的《单词,次数》进行排序;
第一种做法:每次找一个最大的,这样找10次那就效率是O(n*10);
第二种做法:用堆来进行:效率为O(n*log10)这个效率比较高一点。
下面是选用:自己定义hash函数进行key_value的存放:第二种做法:用堆来进行:效率为O(n*log10)这个效率比较高一点。下面代码是实现:代码的优化还有很大空间如:单词的分割只考虑空格,其它的方式先不考虑。
//dazhengshumore.cpp
C++
- #include <cstdio>
- #include <cstdlib>
- #include <iostream>
- #include <cstring>
- #include <fstream>
- #include <sstream>
- #include <stdexcept>
- #define MAX 10000
- #define M 10
- #define DEBUG
- using namespace std;
- class node_C{
- private: char *data;
- int coutnum;
- public :
- node_C():data(NULL),coutnum(0){};
- node_C(const char *t):coutnum(0){
- data=new char[strlen(t)+1];
- strcpy(data,t);
- };
- node_C(const node_C &node_t){
- if(node_t.data==NULL)
- return ;
- data=new char[strlen(node_t.data)+1];
- strcpy(data,node_t.data);
- coutnum=node_t.coutnum;
- }
- node_C & operator =(const node_C & node_t){
- if(this==&node_t)
- return *this;
- node_C temp=node_C(node_t);
- char *temp_t=temp.data;
- temp.data=data;
- data=temp_t;
- coutnum=temp.coutnum;
-
- }
- bool operator ==(const node_C &node_t){
- if(this==&node_t) return true;
- if((node_t.coutnum==coutnum)&&(strcmp(node_t.data,data)==0))
- return true;
- return false;
- }
- int operator++(){
- return ++coutnum;
- }
- int operator++(int){
- int temp=coutnum;
- coutnum++;
- return temp;
- }
- void display(){
- cout<<data<<" "<<coutnum<<endl;
- }
- int get_coutnum(){
- return coutnum;
- }
- void set_coutnum(int s_c){
- coutnum=s_c;
- }
- char *get_data(){
- return data;
- }
- void set_data(char *s_d){
- data=s_d;
- }
- ~node_C(){
- delete []data;
- }
- };
- //typedef struct node{
- // char *data;
- //int coutnum;
- //}node;
- unsigned int simple_hash(const char *str)
- {
- register unsigned int hash;
- register const char *p;
-
- for(hash = 0, p =str; *p ; p++)
- hash = 31 * hash + *p;
-
- return (hash & 0x7FFFFFFF)%MAX;
- }
-
- //读单词文件,并把它列出来
- ifstream& open_file(ifstream &in,const string &file){
- in.close();
- in.clear();
- in.open(file.c_str());
- return in;
- }
- class MakeminHeap{
- public:
- MakeminHeap(node_C *arr,int len);
- void MinHeapFixdown(node_C a[], int i, int n);
- void displayHead();
- void justHead(node_C *arr);
- ~MakeminHeap(){delete []counum;}
- private:
-
- node_C *counum;
- int num_len;
- int arr_i;
-
- };
- MakeminHeap::MakeminHeap(node_C *arr,int len):num_len(len){
- int i;
- arr_i=0;
- counum=new node_C[len];
- for(i=0;i<M;i++,arr_i++){
- while(arr_i<MAX){
- if(arr[arr_i].get_data()!=NULL) {
- counum[i]=arr[arr_i];
- break;}
- else arr_i++;
- }
- }
- for (i = len / 2 - 1; i >= 0; i--)
- MinHeapFixdown(counum, i, len);
- }
- void MakeminHeap::MinHeapFixdown(node_C a[], int i, int n)
- {
- int j, temp;
- node_C temp_n;
- temp_n= a[i];
- j = 2 * i + 1;
- while (j < n)
- {
- if (j + 1 < n && a[j + 1].get_coutnum() < a[j].get_coutnum()) //在左右孩子中找最小的
- j++;
- if (a[j].get_coutnum() >= temp_n.get_coutnum())
- break;
- // a[i] = a[j]; //把较小的子结点往上移动,替换它的父结点
- a[i]=a[j];
- i = j;
- j = 2 * i + 1;
- }
- a[i]=temp_n;
- }
-
- void MakeminHeap::justHead(node_C *arr){
- int i;
- for(i=arr_i;i<MAX;i++){
- if(arr[i].get_coutnum()>counum[0].get_coutnum()){
- counum[0]=arr[i];
- MinHeapFixdown(counum, 0, num_len);
- }
- }
- }
- void MakeminHeap::displayHead(){
- int i;
- for(i=0;i<num_len;i++)
- cout<<"<"<<counum[i].get_data()<<">"<<" has currented:"<<counum[i].get_coutnum()<<"times "<<endl;
- cout<<endl;
- }
-
- int main(int argc, char **argv)
- {
- int i;
- ifstream map_file;
- node_C *arr=new node_C[MAX];
- if(argc!=2)
- throw runtime_error("wrong number of argument");
- if(!open_file(map_file,argv[1])) throw runtime_error("wrong open of file");
- string word,line;
- while(getline(map_file,line)){
- istringstream stream(line);
- while(stream>>word)
- {
- //cout<<word<<" ";//word 的string以空格为结尾。
- const char *wor_temp=word.c_str();
- unsigned int temp=simple_hash(wor_temp);
- while(arr[temp].get_data()!=NULL){
- if(strcmp(arr[temp].get_data(),word.c_str())==0)
- break;
- else
- temp++;
- }
- if(arr[temp].get_data()==NULL) {
- // char *arr_t=new char[strlen(wor_temp)+1];
- //strcpy(arr_t,wor_temp);
- //arr[temp].set_data(arr_t);
- node_C temp_node=node_C(wor_temp);
- arr[temp]=temp_node;
- }
- // ++arr[temp];//有点不规范!
- arr[temp].set_coutnum(arr[temp].get_coutnum()+1);
- }
- }
- #ifdef DEBUG
- for(i=0;i<MAX;i++)//进行对于arr进行堆的排序提取全10位
- if(arr[i].get_data()!=NULL)
- cout<<"<"<<arr[i].get_data()<<">"<<" have currented:"<<arr[i].get_coutnum()<<"times "<<endl;
- cout<<"*************************************************"<<endl;
- #endif
-
- MakeminHeap tempHeap(arr,M);
- //tempHeap.displayHead();
- tempHeap.justHead(arr);
- cout<<"display the ten most frequent words:"<<endl;
- tempHeap.displayHead();
- return 0;
- }
-
-
-
-
-
-
-

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。