data_collector.py
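"""Collect weakly-labelled NER training data for company names.

For each company name read from bse_companies.csv, the script Googles the
name, scrapes the top result, pulls sentences that mention the name, and
tags the matching tokens with B_ORG/I_ORG (everything else gets O). The
token/tag pairs are written out for downstream filtering and training.

Note: the `googlesearch` import appears to target the old `google` PyPI
package, whose `search()` accepts `stop` and `num` keywords; the newer
`googlesearch-python` package exposes a different signature.
"""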
from googlesearch import search
from multiprocessing import Manager, Pool
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
import re
import random

sites_crawled = []
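
# Returns the top Google hit for (roughly) the first half of the company name,
# or "" if that URL was already crawled. Caveat: under multiprocessing.Pool,
# each worker process holds its own copy of sites_crawled, so de-duplication
# is only per-process, not global.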
def get_url(query):
    global sites_crawled
    time.sleep(1)  # throttle queries to avoid being rate-limited by Google
    parts = query.split()
    # Exact-phrase search on the first half (plus one word) of the name
    query = '"' + " ".join(parts[:len(parts) // 2 + 1]) + '"'
    print("Searching:", query)
    url = ""
    for url in search(query, stop=1, num=1):
        print("URL:", url)
        if url not in sites_crawled:
            sites_crawled.append(url)
        else:
            url = ""
            print("Skipping", query)
    return url
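
# Fetches a page and reduces its <p> elements to a de-duplicated list of
# cleaned sentence fragments (tags stripped, special characters removed,
# whitespace collapsed, split on '.').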
def google_sentences(url):
    content = []
    try:
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, "html.parser")
        paragraph_texts = soup.find_all('p')
        content = [re.sub(r'<.+?>', r'', str(p)) for p in paragraph_texts]
        content = [re.sub(r"[\"\#\%\&\(\)\*\+\/\:\<\=\>\@\[\\\]\^\_\`\{\|\}\~]+", " ", document)
                   for document in content]
        content = [re.sub(r"[ \t\n\r\x0b\x0c]+", " ", document).split('.')
                   for document in content]
        content = [item for sublist in content for item in sublist]
        content = list(set(content))  # de-duplicate fragments
    except Exception:
        # network errors and unparsable pages are silently skipped
        pass
    return content
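
# Keeps only lines that contain a random prefix of the company name and are
# longer than the name itself, capped at roughly 100 per company.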
def get_sentences(query, content):
    sentences = []
    org_parts = query.split(' ')
    if len(org_parts) > 1:
        # Match on a random prefix of the name: from just over half the words
        # up to all but the last word, so the full name is never required.
        # min() guards two-word names, where randint's range would otherwise be empty.
        upper = len(org_parts) - 1
        lower = min(len(org_parts) // 2 + 1, upper)
        org_main = " ".join(org_parts[:random.randint(lower, upper)])
    else:
        org_main = org_parts[0]
    for line in content:
        if len(sentences) <= 100:  # take at most ~100 sentences per company; later filtering reduces this further
            line_words = line.split(' ')
            # require lines longer than the name itself, to get meaningful
            # sentences rather than bare mentions of the company name
            if len(line_words) > len(org_parts):
                if org_main in line:
                    sentences.append(line)
        else:
            break
    return sentences
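
# Builds a [B_<tag>, I_<tag>, I_<tag>, ...] list for a name of `length` words.
# Currently unused: its only call site in make_data is commented out.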
def get_query_tags(query, main_tag, length):
    first_tag = ["B_" + main_tag]
    inner_tags = ["I_" + main_tag] * (length - 1)
    tags = first_tag + inner_tags
    return tags

def get_B_indices(sent_words, B_word):
    # all positions where the sentence token equals the first word of the query
    indices = [i for i, x in enumerate(sent_words) if x == B_word]
    return indices
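
# BIO-tags one sentence: every occurrence of the query's first word becomes a
# candidate B_ORG, and the following tokens become I_ORG only while they keep
# matching the rest of the name, case-insensitively.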
def tag_sentence(sent, query_words):
    sent_words = re.findall(r'\w+', sent)
    B_indices = get_B_indices(sent_words, query_words[0])
    X_one = list(sent_words)  # copy, so the trailing '.' and '\n' don't mutate sent_words
    y_one = ['O'] * len(sent_words)
    for B_index in B_indices:
        try:
            # False positives like "20 Microns company, I was born in 2020" also
            # land here; they get eliminated later, after tokenization
            y_one[B_index] = "B_ORG"
            count = 1
            # iterate over at most len(query_words) - 1 follow-up tokens,
            # guarding against running off the end of the sentence
            for i in range(B_index + 1, B_index + len(query_words)):
                if i < len(sent_words):
                    if sent_words[i].lower() == query_words[count].lower():
                        y_one[i] = "I_ORG"
                    else:
                        break
                    count = count + 1
                else:
                    break
        except Exception:
            pass
    X_one.append('.')
    y_one.append('O')
    X_one.append('\n')
    y_one.append('\n')
    # count holds the number of words of the company name found in the sentence.
    # Disabled heuristic for eliminating some possible errors:
    '''
    capital_in_sent = sum(1 for c in sent if c.isupper())
    capital_in_query = 0
    for word in query_words:
        capital_in_query = capital_in_query + sum(1 for c in word if c.isupper())
    if B_index == 0:
        if capital_in_sent > 3 * capital_in_query:
            X_one = []
            y_one = []
    else:
        if capital_in_sent > 3 * (capital_in_query + 1):
            X_one = []
            y_one = []
    '''
    return X_one, y_one
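
# Full pipeline for one company name: search -> scrape -> select sentences ->
# tag tokens. Appends the (token, tag) pairs to the shared result list.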
def make_data(query, res_list):
    url = get_url(query)
    X_onequery = []
    y_onequery = []
    # an empty url means this company shares its site with one already crawled,
    # so sentences from that site have already been scraped
    if url != "":
        content = google_sentences(url)
        if content != []:
            sentences = get_sentences(query, content)
            query_words = re.findall(r'\w+', query)
            #tags = get_query_tags(query, "ORG", len(query_words))
            for sent in sentences:
                print(sent)
                X_sent, y_sent = tag_sentence(sent, query_words)
                X_onequery = X_onequery + X_sent
                y_onequery = y_onequery + y_sent
        else:
            print("Empty content - ignoring")
    X_y_onequery = list(zip(X_onequery, y_onequery))
    res_list.append(X_y_onequery)
    print("DATA OF ONE QUERY:", X_y_onequery)
def write_csv(final_res):
    with open('org_data_6.csv', 'w', newline='') as f:
        f.write('\n'.join('%s %s' % x for x in final_res))
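
# Driver: fan the company names out over a 4-worker Pool, collecting results
# in a Manager list. Note that apply_async silently swallows worker
# exceptions unless .get() is called on the returned AsyncResult.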
if __name__ == "__main__":
    df = pd.read_csv("bse_companies.csv", encoding="ISO-8859-1", usecols=['Company Name'])
    names = list(df['Company Name'])
    names = names[5001:]  # skip the first 5001 names, presumably handled in earlier runs
    manager = Manager()
    pool = Pool(4)
    res_list = manager.list()
    print(len(names))
    start = time.time()
    for name in names:
        print("---------------------- Scraping for", name, "------------------------")
        pool.apply_async(make_data, (name, res_list,))
    pool.close()
    pool.join()
    end = time.time()
    print("Time taken:", end - start)
    print("FINAL:\n", res_list)
    final_res = [item for sublist in res_list for item in sublist]
    write_csv(final_res)