diff --git a/PB14000000/hw1.cpp b/PB14000000/hw1.cpp deleted file mode 100644 index 6c88fd9..0000000 --- a/PB14000000/hw1.cpp +++ /dev/null @@ -1 +0,0 @@ -for example diff --git a/PB15000175/hw1.cpp b/PB15000175/hw1.cpp new file mode 100644 index 0000000..7e2a84c --- /dev/null +++ b/PB15000175/hw1.cpp @@ -0,0 +1,300 @@ +#include +#include +#include +#include +#include +#include +#include +using namespace std; + +void SearchFiles(char* dir,string filename[],int& fileNum); +void ReadFile(FILE *fp, unordered_map& wordValueMap,unordered_map& wordNameMap,int& chrtCount,int& wordCount,int& lineCount); +void WriteFile(ofstream& os, unordered_map wordValueMap, unordered_map wordNameMap, int chrtCount, int wordCount, int lineCount); +int GetTopTenWords(unordered_map wordValueMap, unordered_map wordNameMap, string topTenWordName[],int topTenWordNum[]); +int GetTopTenPhrases(unordered_map wordNameMap, string topTenPhraseName[], int topTenPhraseNum[],int wordCount); + +string allWords[20000000]; + +int main(int arc,char* args[]) +{ + char dir[200]; + string fileName[2000]; + int i, fileNum = 0,chrtCount = 0,wordCount=0, lineCount = 0; + FILE *fp; + + unordered_map wordValueMap; + unordered_map wordNameMap; + + strcpy_s(dir, args[1]); + SearchFiles(dir,fileName,fileNum); + cout << "File Number: " << fileNum << endl; + + for (i = 0; i < fileNum; i++) + { + if (fopen_s(&fp,fileName[i].c_str(), "r")!=0) + { + cerr << "Cannot open file " << fileName[i] << endl; + exit(1); + } + ReadFile(fp, wordValueMap,wordNameMap, chrtCount,wordCount,lineCount); + fclose(fp); + } + + cout << "Finish Reading. WordCount: " << wordCount << endl; + + ofstream os("result.txt", ios::out); + + if (!os) + { + cerr << "Cannot write file result.txt!" << endl; + exit(1); + } + + WriteFile(os, wordValueMap, wordNameMap, chrtCount,wordCount, lineCount); + os.close(); + cout << "Finish Writing" << endl; + + return 0; +} + +void SearchFiles(char* dir,string fileName[],int& fileNum) +{ + intptr_t handle; + _finddata_t findData; + char searchdir[200],newdir[200],newfile[200]; + + strcpy_s(searchdir, dir); + strcat_s(searchdir, "\\*"); + handle = _findfirst(searchdir, &findData); + + if (handle == -1) + { + cout << "Failed to find first file!\n"; + return; + } + + do + { + if (strcmp(findData.name, ".") != 0 && strcmp(findData.name, "..") != 0) + { + if (findData.attrib == _A_SUBDIR) + { + strcpy_s(newdir, dir); + strcat_s(newdir, "\\"); + strcat_s(newdir, findData.name); + SearchFiles(newdir,fileName,fileNum); + } + else + { + strcpy_s(newfile, dir); + strcat_s(newfile, "\\"); + strcat_s(newfile, findData.name); + fileName[fileNum]=newfile; + fileNum+=1; + } + } + } while (_findnext(handle, &findData) == 0); + + _findclose(handle); +} + + +void ReadFile(FILE *fp, unordered_map& wordValueMap, unordered_map& wordNameMap, int& chrtCount, int& wordCount, int& lineCount) +{ + char ch; + string word; + unsigned int i,j; + unordered_map::iterator itValue; + unordered_map::iterator itName; + + while ((ch = fgetc(fp)) != EOF) + { + if (ch >= 32 && ch <= 126) + chrtCount++; + + if ((ch >= 48 && ch <= 57) || (ch >= 65 && ch <= 90) || (ch >= 97 && ch <= 122)) + word=word+ch; + + else + { + if (word.length() >= 4) + { + for (i = 0; i < 4; i++) + if (!(((word[i] >= 65) && (word[i] <= 90)) || ((word[i] >= 97) && (word[i] <= 122)))) + break; + if (i >= 4) + { + for ( j = word.length() - 1; word[j] >= 48 && word[j] <= 57; j--); + string newWord(word, 0, j + 1); + for (j = 0; j < newWord.length(); j++) + if (newWord[j] >= 65 && newWord[j] <= 90) + newWord[j] = newWord[j] + 32; + + allWords[wordCount] = newWord; + wordCount++; + + itValue = wordValueMap.find(newWord); + if (itValue == wordValueMap.end()) + wordValueMap.insert(pair(newWord, 1)); + + else + itValue->second++; + + itName = wordNameMap.find(newWord); + if (itName == wordNameMap.end()) + wordNameMap.insert(pair(newWord, word)); + + else if (word.compare(itName->second) < 0) + itName->second = word; + } + } + + word.erase(); + if(ch == '\n') + lineCount++; + } + } + + lineCount++; +} + + +void WriteFile(ofstream& os, unordered_map wordValueMap, unordered_map wordNameMap, int chrtCount, int wordCount, int lineCount) +{ + string topTenWordName[10]; + int topTenWordNum[10]; + string topTenPhraseName[10]; + int topTenPhraseNum[10]; + int i,wordNum,phraseNum; + + os << "Char_Number: " << chrtCount << endl; + os << "Line_Number: " << lineCount << endl; + os << "Words_Number " << wordCount << endl; + os << endl; + os << "The top ten frequency of words:" << endl; + + wordNum = GetTopTenWords(wordValueMap,wordNameMap,topTenWordName,topTenWordNum); + for (i = 0; i < wordNum; i++) + os << "<" << topTenWordName[i] << ">: " << topTenWordNum[i] << endl; + + os << endl; + os << "The top ten frequenzy of phrases:" << endl; + + phraseNum = GetTopTenPhrases(wordNameMap, topTenPhraseName, topTenPhraseNum, wordCount); + for (i = 0; i < phraseNum; i++) + os << "<" << topTenPhraseName[i] << ">: " << topTenPhraseNum[i] << endl; +} + +int GetTopTenWords(unordered_map wordValueMap, unordered_map wordNameMap, string topTenWordName[], int topTenWordNum[]) +{ + unordered_map::iterator itValue = wordValueMap.begin(); + unordered_map::iterator itName; + int i, j,wordNum; + for (i = 0; i < 10; i++) + { + topTenWordNum[i] = 0; + topTenWordName[i] = "\0"; + } + + while (itValue != wordValueMap.end()) + { + i = 9; + + while (itValue->second > topTenWordNum[i] && i >= 0) + i--; + + if (i < 9) + { + for (j = 9; j > i + 1; j--) + { + topTenWordNum[j] = topTenWordNum[j - 1]; + topTenWordName[j] = topTenWordName[j - 1]; + } + + topTenWordNum[i + 1] = itValue->second; + topTenWordName[i + 1] = itValue->first; + } + + itValue++; + } + + wordNum = wordValueMap.size(); + if (wordNum >= 10) + wordNum = 10; + + for (i = 0; i < wordNum; i++) + { + itName = wordNameMap.find(topTenWordName[i]); + topTenWordName[i] = itName->second; + } + + return wordNum; +} + +int GetTopTenPhrases(unordered_map wordNameMap, string topTenPhraseName[], int topTenPhraseNum[],int wordCount) +{ + unordered_map phraseMap; + unordered_map::iterator itPhrase; + unordered_map::iterator itName; + string phrase, word1, word2; + int i, j,phraseNum; + + for (i = 0; i < 10; i++) + { + topTenPhraseNum[i] = 0; + topTenPhraseName[i] = "\0"; + } + + for (i = 0; i < wordCount - 1; i++) + { + phrase = allWords[i] + "$" + allWords[i + 1]; + itPhrase = phraseMap.find(phrase); + if (itPhrase == phraseMap.end()) + phraseMap.insert(pair(phrase, 1)); + else + itPhrase->second++; + } + + itPhrase = phraseMap.begin(); + while (itPhrase != phraseMap.end()) + { + i = 9; + + while (itPhrase->second >topTenPhraseNum[i] && i >= 0) + i--; + + if (i < 9) + { + for (j = 9; j > i + 1; j--) + { + topTenPhraseNum[j] = topTenPhraseNum[j - 1]; + topTenPhraseName[j] = topTenPhraseName[j - 1]; + } + + topTenPhraseNum[i + 1] = itPhrase->second; + topTenPhraseName[i + 1] = itPhrase->first; + } + + itPhrase++; + } + + phraseNum = phraseMap.size(); + if (phraseNum >= 10) + phraseNum = 10; + + for (i = 0; i < phraseNum; i++) + { + for (j = 0; j < topTenPhraseName[i].length(); j++) + if (topTenPhraseName[i][j] == '$') + break; + word1.assign(topTenPhraseName[i], 0, j); + word2.assign(topTenPhraseName[i], j + 1, topTenPhraseName[i].length()); + itName = wordNameMap.find(word1); + topTenPhraseName[i].assign(itName->second); + topTenPhraseName[i].append(" "); + itName = wordNameMap.find(word2); + topTenPhraseName[i].append(itName->second); + } + + return phraseNum; +} diff --git a/README.md b/README.md deleted file mode 100644 index 16ba855..0000000 --- a/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# homework1 - homework1 USTC 2018春季学期 现代软件工程第一次作业 - -# ⚠️注意 - -1.新建以个人完整学号命名的文件夹(例如:PB14000000,PB大写),新建hw1.cpp文件(所有人统一命名为hw1.cpp,hw小写)。Eg:【homework1/PB14000000/hw1.cpp】 -提交教程:http://www.cnblogs.com/bingc/p/5080794.html - -2.所有代码内容(注释)不要出现汉字字符 - -3.不要使用预编译头文件 - -4.提交唯一一个完整的cpp文件(包含main函数,头文件) - -5.输入以命令行参数传入(文件夹的路径) - -6.在当前路径输出最终结果文件result.txt - -测试环境: -ubuntu 16.04 -g++ 5.4.0