Skip to content

Commit

Permalink
[+] Added hard-coded switch for different dictionary types
Browse files Browse the repository at this point in the history
[+] Stored output for OpenOffice dictionary
[*] Structure altered a bit
  • Loading branch information
mmatrosov committed Oct 14, 2013
1 parent f1ae39f commit 14c86da
Show file tree
Hide file tree
Showing 6 changed files with 202,327 additions and 145,880 deletions.
79 changes: 72 additions & 7 deletions WordPicker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@

using namespace std;

/// Identifies which dictionary source ReadWords() should load.
enum class DictType
{
  Freq,       ///< Dictionary loaded by ReadWordsFreq().
  OpenOffice  ///< Dictionary loaded by ReadWordsOpenOffice().
};

//////////////////////////////////////////////////////////////////////////
///
vector<wstring> ReadWords()
vector<wstring> ReadWordsFreq()
{
#ifdef _DEBUG
string dictPath = R"(data\freqrnc2011_crop.csv)";
Expand All @@ -20,10 +25,6 @@ vector<wstring> ReadWords()
throw runtime_error("dictionary not found");
}

cout << "Reading dictionary..." << endl;

auto start = chrono::steady_clock::now();

// An instance of codecvt is deleted by the locale object.
locale utf8(locale::empty(), new codecvt_utf8<wchar_t>);
input.imbue(utf8);
Expand Down Expand Up @@ -56,9 +57,73 @@ vector<wstring> ReadWords()
transform(table.begin(), table.end(), words.begin(),
[](const Cell& cell) { return move(cell.second); });

return words;
}

//////////////////////////////////////////////////////////////////////////
///
/// Reads the word list from the OpenOffice-style ".dic" dictionary file.
///
/// The file is decoded as UTF-8 and processed line by line. A line is
/// accepted only if it consists entirely of one or more characters in the
/// range а-я, optionally followed by '/' and arbitrary trailing text
/// (presumably affix flags - TODO confirm). Only the leading word part is
/// stored; every other line is silently skipped.
///
/// Debug builds (_DEBUG defined) read the cropped dictionary file.
///
/// NOTE(review): 'ё' (U+0451) lies outside the а-я code-point range, so
/// words containing it appear to be skipped - confirm this is intended.
///
/// @return The accepted words, in file order.
/// @throws runtime_error if the dictionary file cannot be opened.
vector<wstring> ReadWordsOpenOffice()
{
#ifdef _DEBUG
  const string path = R"(data\ru_RU_crop.dic)";
#else
  const string path = R"(data\ru_RU.dic)";
#endif

  wifstream dictFile(path);
  if (!dictFile)
  {
    cerr << "Dictionary not found in \"" + path + "\"!" << endl;
    throw runtime_error("dictionary not found");
  }

  // The locale object takes ownership of the codecvt facet and deletes it.
  dictFile.imbue(locale(locale::empty(), new codecvt_utf8<wchar_t>));

  // Group 1 is the word itself; the optional group 2 is the "/..." suffix.
  const wregex entryPattern(L"([а-я]+)(/.*)?");

  vector<wstring> result;
  wsmatch match;

  for (wstring entry; getline(dictFile, entry); )
  {
    // The whole line has to match, not just a prefix of it.
    if (!regex_match(entry, match, entryPattern))
      continue;

    result.push_back(match[1].str());
  }

  return result;
}

vector<wstring> ReadWords(DictType type)
{
vector<wstring> words;

cout << "Reading dictionary..." << endl;

auto start = chrono::steady_clock::now();

switch (type)
{
case DictType::Freq:
words = ReadWordsFreq();
break;
case DictType::OpenOffice:
words = ReadWordsOpenOffice();
break;
default:
throw logic_error("Unknown dictionary type!");
}

auto finish = chrono::steady_clock::now();

cout << "Done " << words.size() << " words in " <<
cout << "Done " << words.size() << " words in " <<
chrono::duration_cast<chrono::milliseconds>(finish - start).count() << "ms" << endl;

return words;
Expand Down Expand Up @@ -117,7 +182,7 @@ int _tmain(int argc, _TCHAR* argv[])
try
{
// Initialize words list
auto words = ReadWords();
auto words = ReadWords(DictType::OpenOffice);

MatchPatterns(words);
}
Expand Down
Loading

0 comments on commit 14c86da

Please sign in to comment.