-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathioput.py
95 lines (82 loc) · 3.38 KB
/
ioput.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import pandas as pd
import os
import datetime
from Clawer_Base.logger import logger
class Type_Input:
    """Load a table of types and work out which of them still need processing.

    The table is read from an Excel or CSV file into ``self.df`` and indexed
    by ``col_name``. ``self.type_list`` holds the index values to process;
    with ``method='add'`` the values that already have a file in
    ``res_floder`` are left out.

    ``self.type`` starts as ``''``. Nothing in this class assigns it, so the
    caller is expected to set it before calling ``convert_to_param_dict`` or
    ``save_path_dict``.
    """

    def __init__(self, type_file_path, col_name, res_floder, method=''):
        """Read the type table and compute ``self.type_list``.

        Args:
            type_file_path: Path of an ``.xls``/``.xlsx`` or ``.csv`` file.
            col_name: Column to use as the DataFrame index.
            res_floder: Folder that is scanned for existing result files.
            method: ``'add'`` skips types that already have a result file;
                any other value keeps every type.

        Raises:
            ValueError: If the file name is neither Excel nor CSV.
        """
        # Decide on the file name only, case-insensitively, so a directory
        # such as "old.xls/" cannot make a CSV file be parsed as Excel.
        file_name = os.path.basename(type_file_path).lower()
        if '.xls' in file_name:  # also covers ".xlsx"
            self.df = pd.read_excel(type_file_path, index_col=col_name)
        elif '.csv' in file_name:
            self.df = pd.read_csv(type_file_path, index_col=col_name)
        else:
            # Fail here with a clear message; carrying on would only crash
            # later with an AttributeError on the missing ``self.df``.
            raise ValueError('类型文件必须是excel 或 csv 文件')
        self.method = method
        self.res_floder = res_floder
        self.type_list = self.filter_by_method()
        self.type = ''

    def check_result(self):
        """Return the type prefixes of the files found under ``res_floder``.

        Each file name (searched recursively) is cut at its first ``.`` and
        then at its first ``_``; the remaining prefix is taken as the type.

        Returns:
            A sorted list of unique prefixes, or ``None`` when the folder
            does not exist.
        """
        if not os.path.exists(self.res_floder):
            return None
        file_names = set()
        for _root, _dirs, files in os.walk(self.res_floder):
            file_names.update(files)
        # NOTE(review): a type whose own name contains "_" or "." is
        # truncated here and can never match in filter_by_method -- confirm
        # that type values are free of both characters.
        return sorted({name.split('.')[0].split('_')[0] for name in file_names})

    def filter_by_method(self):
        """Return the index values that should be processed.

        In ``'add'`` mode (and only when existing results were found) the
        already processed types are removed; the original index order is
        kept and duplicates are dropped. Otherwise the full index is
        returned unchanged.
        """
        existing = self.check_result()
        if self.method == 'add' and existing:
            done = set(existing)
            # File names are built with '%s' % type, so compare the string
            # form; this also matches numeric index values.
            return [t for t in dict.fromkeys(self.df.index) if str(t) not in done]
        return list(self.df.index)

    # def get_type(self):
    #     if self.type_list:
    #         self.type = self.type_list.pop()
    #         return self.type
    #     else:
    #         return
    #
    # def get_cname(self):
    #     return self.df.loc[self.type, 'cname']

    def convert_to_param_dict(self):
        """Return the current type wrapped as ``{'types': self.type}``."""
        return {'types': self.type}

    def save_path_dict(self, floder_path):
        """Build and store the three output paths for the current type.

        Args:
            floder_path: Folder the paths are placed in (joined with ``/``).

        Returns:
            Tuple ``(result_path, point_by_radius, point_by_result)``; the
            same values are stored on the instance.
        """
        prefix = '%s/%s' % (floder_path, self.type)
        self.result_path = prefix + '_result.csv'
        self.point_by_radius = prefix + '_point_by_radius.csv'
        self.point_by_result = prefix + '_point_by_results.csv'
        return self.result_path, self.point_by_radius, self.point_by_result
class Res_saver:
    """Saver for scraped results.

    Writes ``res_list`` to
    ``<floder_path>/<YYYYMMDD>/<file_name>_<HHMMSS>.<ext>``; the timestamp is
    taken once, when the saver is constructed.
    """

    def __init__(self, res_list, file_name, floder_path='result', file_type='csv', duplicates_key=''):
        """Store the records and the output settings.

        Args:
            res_list: Records passed to ``pd.DataFrame``.
            file_name: Base name of the output file (timestamp is appended).
            floder_path: Root output folder; a dated sub-folder is created
                inside it.
            file_type: ``'csv'`` or ``'excel'``; anything else writes nothing.
            duplicates_key: Column label(s) used to drop duplicate rows;
                empty to keep all rows.
        """
        self.res_list = res_list
        self.file_name = file_name
        self.floder_path = floder_path
        self.date_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        self.file_type = file_type
        self.duplicates_key = duplicates_key

    def save_as_file(self):
        """Write the records to disk.

        An empty ``res_list`` is reported and still written (as an empty
        table).

        Returns:
            The path of the file that was written, or ``None`` when
            ``file_type`` is not supported and nothing was saved.
        """
        # Always build the target folder from the same pieces used for the
        # file path; an empty floder_path then means "relative to the
        # current directory" instead of pointing at the filesystem root.
        day_dir = os.path.join(self.floder_path or '', self.date_str[:8])
        # exist_ok avoids the check-then-create race between processes.
        os.makedirs(day_dir, exist_ok=True)

        df = pd.DataFrame(self.res_list)
        if self.res_list:
            if self.duplicates_key:
                df = df.drop_duplicates(subset=self.duplicates_key)
        else:
            print('%s 结果为空' % self.file_name)
            logger.info('%s 结果为空' % self.file_name)

        stem = '%s_%s' % (self.file_name, self.date_str[8:])
        if self.file_type == 'csv':
            target = os.path.join(day_dir, stem + '.csv')
            df.to_csv(target)
        elif self.file_type == 'excel':
            target = os.path.join(day_dir, stem + '.xlsx')
            df.to_excel(target)
        else:
            # Reserved for future database formats; nothing is written, so
            # do not claim that the result was saved.
            print('%s 未保存: 不支持的文件类型 %s' % (self.file_name, self.file_type))
            return None

        print('%s 结果已保存' % self.file_name)
        return target
# class Mongo_dumper:
# def __init__(self,dbname):