-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathgithub_action.check_spiders_scheduled.py
38 lines (28 loc) · 1.24 KB
/
github_action.check_spiders_scheduled.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os
from os.path import isfile, join
def checkdiff(required, testing):
    """Return the items of *required* that are missing from *testing*.

    Order of *required* is preserved. *testing* is converted to a set once
    so each membership check is O(1) instead of a linear scan.

    Args:
        required: iterable of items that must all be present.
        testing: iterable of items actually present.

    Returns:
        list: elements of *required* not found in *testing* (may be empty).
    """
    testing_set = set(testing)
    return [x for x in required if x not in testing_set]
def verify_spiders_are_scheduled():
    """Verify every spider module is referenced by a crawler schedule file.

    Scans ``dataPipelines/gc_scrapy/gc_scrapy/spiders`` (relative to this
    file) for spider modules — regular files not prefixed with ``_`` such as
    ``__init__.py`` — and collects every non-blank line from each ``*.txt``
    file under ``paasJobs/crawler_schedule``. Spider names are compared with
    the ``.py`` suffix stripped on both sides.

    Raises:
        RuntimeError: if one or more spiders appear in the spiders directory
            but in no schedule file.
    """
    current_dir = os.path.dirname(os.path.realpath(__file__))

    # Spider module names, without the ".py" suffix; "_"-prefixed files
    # (e.g. __init__.py) are not spiders.
    spiders_dir = join(current_dir, 'dataPipelines', 'gc_scrapy',
                       'gc_scrapy', 'spiders')
    spiders_in_dir = [
        f.replace('.py', '')
        for f in os.listdir(spiders_dir)
        if isfile(join(spiders_dir, f)) and not f.startswith("_")
    ]

    # Every non-blank line of every schedule .txt file names one spider.
    schedule_dir = join(current_dir, 'paasJobs', 'crawler_schedule')
    spiders_in_schedule = []
    for f_name in os.listdir(schedule_dir):
        schedule_path = join(schedule_dir, f_name)
        if isfile(schedule_path) and f_name.endswith('.txt'):
            with open(schedule_path) as f:
                # Iterate the file lazily instead of readlines().
                for line in f:
                    if line.strip():
                        spiders_in_schedule.append(
                            line.strip().replace('.py', ''))

    unused = checkdiff(spiders_in_dir, spiders_in_schedule)
    if unused:
        raise RuntimeError(
            f"ERROR: Spider(s) not used in a schedule: {unused}")
    else:
        print("All spiders are in a schedule file")
if __name__ == "__main__":
    # CI entry point: a RuntimeError (non-zero exit) signals that at least
    # one spider is missing from every schedule file.
    verify_spiders_are_scheduled()