-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSimpleTokenizer.h
67 lines (59 loc) · 2.33 KB
/
SimpleTokenizer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#pragma once
#include <iostream>
#include <tuple>
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>
#include "common.h"
namespace std
{
struct WstringHash {
std::size_t operator()(const std::wstring& str) const {
return std::hash<std::wstring>()(str);
}
};
struct TupleHash {
template <typename T>
std::size_t operator()(const T& tuple) const {
std::size_t seed = 0;
hash_combine(seed, std::get<0>(tuple));
hash_combine(seed, std::get<1>(tuple));
return seed;
}
template <typename T>
void hash_combine(std::size_t& seed, const T& value) const {
seed ^= std::hash<T>()(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
};
}
/// <summary>
/// 文本转换为数字向量
/// </summary>
class SAM_EXPORTS SimpleTokenizer
{
public:
SimpleTokenizer();
virtual ~SimpleTokenizer();
void Init();//初始化成员变量
std::vector<int64_t> tokenlize(std::wstring textpromot);
protected:
std::wregex pat;//正则表达式
const int contextLength = 77;//上下文长度,句子长度
std::unordered_map<std::wstring, int> encoder;
std::unordered_map<int, std::wstring> decoder;
std::unordered_map<int, std::wstring> byte_encoder;
std::unordered_map<std::wstring, int> byte_decoder;
std::unordered_map<std::wstring, std::wstring> cache;
std::unordered_map<std::tuple<std::wstring, std::wstring>, int, std::TupleHash> bpe_ranks;
std::wstring bpe(const std::wstring& token);
std::wstring whitespace_clean(std::wstring text);
std::unordered_map<int, std::wstring> BytesToUnicode();
std::wstring Decode(const std::vector<int>& tokens);
std::vector<int64_t> Encode(const std::wstring& text);
std::vector<std::tuple<std::wstring, std::wstring>> LoadBPEMerges(const std::wstring& bpePath);
std::vector<std::pair<std::wstring, std::wstring>> GetPairs(const std::vector<std::wstring>& words);
std::vector<std::wstring> split(const std::wstring& str, wchar_t delimiter);
std::tuple<std::wstring, std::wstring> FindFirstPair(const std::vector<std::pair<std::wstring, std::wstring>>& pairs,
const std::unordered_map<std::tuple<std::wstring, std::wstring>, int, std::TupleHash>& bpe_ranks);
};