-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstring_generator.py
121 lines (97 loc) · 3.5 KB
/
string_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import random as rnd
import re
import string
import requests
from bs4 import BeautifulSoup
def create_strings_from_file(filename, count):
    """
    Create `count` strings by reading the lines of a UTF-8 text file.

    Every line is truncated to its first 200 characters. Empty lines and
    lines that contain only whitespace are skipped, since they would yield
    blank strings. When the file holds fewer usable lines than `count`, the
    lines are repeated in file order until `count` strings are available.

    Args:
        filename: Path of the text file to read (opened as UTF-8).
        count: Number of strings to return.

    Returns:
        A list of exactly `count` strings, or an empty list when `count`
        is zero or negative.

    Raises:
        ValueError: If the file contains no usable line. ValueError is a
            subclass of Exception, so callers catching Exception still work.
    """
    # Only hold the file open for the read itself.
    with open(filename, "r", encoding="utf8") as f:
        truncated = (line[:200] for line in f.read().splitlines())
        lines = [line for line in truncated if line.strip()]

    # An unusable file is reported even when no strings are requested.
    if not lines:
        raise ValueError("No lines could be read in file")

    if count <= 0:
        return []

    # Cycle through the lines: as many full passes as fit, then a partial one.
    full_passes, remainder = divmod(count, len(lines))
    return lines * full_passes + lines[:remainder]
def create_strings_from_dict(length, allow_variable, count, lang_dict):
    """
    Create `count` strings made of words picked at random from a dictionary.

    Words are drawn with replacement and joined with single spaces.

    Args:
        length: Number of words per string. When `allow_variable` is true
            this is the upper bound and the word count of each string is
            drawn uniformly from 1..length.
        allow_variable: Whether the word count varies from string to string.
        count: Number of strings to create.
        lang_dict: Indexable sequence of words to pick from.

    Returns:
        A list of `count` strings.

    Raises:
        ValueError: If a word has to be picked from an empty `lang_dict`.
    """
    dict_len = len(lang_dict)
    strings = []
    for _ in range(count):
        # The word count is drawn before the words so that the order of
        # random calls (and thus seeded output) stays stable.
        word_count = rnd.randint(1, length) if allow_variable else length
        if word_count > 0 and dict_len == 0:
            raise ValueError("Cannot pick words from an empty dictionary")
        strings.append(
            " ".join(lang_dict[rnd.randrange(dict_len)] for _ in range(word_count))
        )
    return strings
def create_strings_from_wikipedia(minimum_length, count, lang):
    """
    Create `count` strings from sentences of randomly picked Wikipedia pages.

    Random pages of the `lang` Wikipedia are fetched until enough sentences
    have been collected. Each text line of a page is reduced to its word
    tokens (runs of word characters and apostrophes) joined by single
    spaces, then truncated to 200 characters. A line is kept only when it
    has more than `minimum_length` space-separated words and does not
    mention "Wikipedia"/"wikipedia".

    Args:
        minimum_length: A sentence must contain strictly more words than this.
        count: Number of strings to return.
        lang: Wikipedia language subdomain, e.g. "en".

    Returns:
        A list of at most `count` strings; empty when `count` is zero or
        negative (no request is made in that case).

    Note:
        A request that times out (3 seconds) is retried with a new random
        page, without any retry limit. Other `requests` errors propagate to
        the caller. The loop only ends once enough sentences were found.
    """
    # Both values are loop invariants, so build them once.
    page_url = "https://{}.wikipedia.org/wiki/Special:Random".format(lang)
    word_pattern = re.compile(r"[\w']+")

    sentences = []
    while len(sentences) < count:
        # We fetch a random page
        try:
            page = requests.get(page_url, timeout=3.0)  # take into account timeouts
        except requests.exceptions.Timeout:
            continue

        soup = BeautifulSoup(page.text, "html.parser")

        # Drop script and style elements so their content is not read as text.
        for tag in soup(["script", "style"]):
            tag.extract()

        # Only take a certain length
        lines = []
        for raw_line in soup.get_text().splitlines():
            sentence = " ".join(word_pattern.findall(raw_line.strip()))[0:200]
            if (
                len(sentence.split(" ")) > minimum_length
                and "Wikipedia" not in sentence
                and "wikipedia" not in sentence
            ):
                lines.append(sentence)

        # Remove the last lines that talk about contributing: the final five
        # are dropped, but at least one line is kept when any exists.
        sentences.extend(lines[0 : max(1, len(lines) - 5)])

    return sentences[0:count]
def create_strings_randomly(length, allow_variable, count, let, num, sym, lang):
    """
    Create `count` strings by randomly sampling from a pool of characters.

    Each string is made of space-separated "words"; each word is a random
    sequence of characters drawn with replacement from the pool.

    Args:
        length: Number of words per string. When `allow_variable` is true
            this is the upper bound and the word count of each string is
            drawn uniformly from 1..length.
        allow_variable: Whether the word count varies from string to string.
        count: Number of strings to create.
        let: Include letters in the pool (CJK ideographs when `lang` is
            "cn", ASCII letters otherwise).
        num: Include the digits 0-9 in the pool.
        sym: Include punctuation symbols in the pool.
        lang: Language code; "cn" switches to ideographs and shorter words.

    Returns:
        A list of `count` strings.

    Note:
        The three flags are interpreted by truthiness. If none of them is
        set, all three character classes are used.
    """
    # If none specified, use all three
    if not (let or num or sym):
        let, num, sym = True, True, True

    pool = ""
    if let:
        if lang == "cn":
            # Code points 19968..40907 (U+4E00..U+9FCB): CJK characters
            pool += "".join(chr(i) for i in range(19968, 40908))
        else:
            pool += string.ascii_letters
    if num:
        pool += "0123456789"
    if sym:
        pool += "!\"#$%&'()*+,-./:;?@[\\]^_`{|}~"

    # Words are 1-2 characters long for "cn", 2-10 characters otherwise.
    if lang == "cn":
        min_seq_len, max_seq_len = 1, 2
    else:
        min_seq_len, max_seq_len = 2, 10

    strings = []
    for _ in range(count):
        word_count = rnd.randint(1, length) if allow_variable else length
        words = []
        for _ in range(word_count):
            # The word length is drawn before its characters; keeping this
            # order keeps seeded output stable.
            seq_len = rnd.randint(min_seq_len, max_seq_len)
            words.append("".join(rnd.choice(pool) for _ in range(seq_len)))
        strings.append(" ".join(words))
    return strings