-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess-ajustes.py
137 lines (108 loc) · 5.45 KB
/
process-ajustes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Use this to prepare data on Ajustes Diretos (non-bid contracts) from the
# Portuguese government for import into Open Spending. The data is provided in
# a file with one JSON document per line and processed into a CSV.
# Source of the data: http://www.base.gov.pt
# Usage: $ python process-ajustes.py [file-name]
import csv
import json
import sys
import os
import fileinput
from datetime import datetime
# Check if the file-name is passed as an argument
if len(sys.argv) > 1:
file_name = sys.argv[1]
file_in = file_name + '.json'
file_out = file_name + '.csv'
else:
print 'Please provide the name of the file that should be processed. Eg. $ python process-ajustes.py [data_file]'
exit (1)
# Check if there is a JSON with that name
if os.path.isfile(file_in) == False:
print 'Oh my, something went wrong. Are you sure there is a file called \'' + file_in + '\'?'
exit(1)
# Check if output csv already exists.
if os.path.isfile(file_out):
print 'There is already a CSV file named ' + file_out + '.'
exit(1)
with open(file_out, "w") as file:
csv_file = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
# Write the header
csv_file.writerow(['signing_date', 'description', 'amount', 'execution_deadline', 'execution_place', 'publication_date', 'id', 'cpvs_id', 'cpvs_description', 'contract_types', 'object_description', 'contracting_procedure_type', 'contract_fundamentation', 'direct_award_fundamentation', 'contracted_id', 'contracted_nif', 'contracted_name', 'contracting_id', 'contracting_nif', 'contracting_name', 'observations', 'close_date', 'cause_deadline_change', 'total_price', 'cause_price_change'])
# Loop over each of the contracts
for line in fileinput.input(file_in):
try:
item = json.loads(line)
except ValueError as ex:
continue
# The dataset contains some contracts that are not Ajustes Diretos
if not item['contractingProcedureType'] == "Ajuste directo":
continue
# Clean up the currency field: 1) remove thousand seperator; 2) remove
# the currency; and 3) substitute decimal mark ',' for '.'
list_prices = ['initialContractualPrice', 'totalEffectivePrice']
for price in list_prices:
if item[price]:
item[price] = item[price].encode('utf-8').translate(None, '. €').replace(',', '.')
# Some signing dates are emtpy. Set empty ones to the publication date
# to prevent Openspending from rejecting the complete row.
if item['signingDate'] == None:
item['signingDate'] = item['publicationDate']
# Convert dates to proper format (eg. 2013-10-23)
list_dates = ['publicationDate', 'signingDate', 'closeDate']
for date in list_dates:
if item[date]:
item[date] = datetime.strptime(item[date], '%d-%m-%Y').date()
# Split the CPV code from its description
# Failsafe
cpvs_id = ''
cpvs_description = ''
try:
cpvs_pieces = item['cpvs'].split(',', 1)
cpvs_id = cpvs_pieces[0].strip()
cpvs_description = cpvs_pieces[1].strip().encode('utf-8')
except:
pass
# Encode the texts
list_longtexts = ['description', 'executionDeadline', 'observations', 'contractTypes', 'objectBriefDescription', 'contractingProcedureType', 'contractFundamentationType', 'directAwardFundamentationType', 'causesDeadlineChange', 'causesPriceChange']
for longtext in list_longtexts:
if item[longtext]:
item[longtext] = item[longtext].encode('utf-8').replace("\r\n", " ").replace("\n\r", " ").replace("\r", " ").replace("\n", " ")
# Multiple locations are split with '<BR/>' substitute these for ' | '
item['executionPlace'] = item['executionPlace'].encode('utf-8').replace('<BR/>', ' | ')
# In some cases there are several contracting and contracted entities.
# Since openspending doesn't support this, we combine these into a new
# unique entity.
# Process the contracted entities
list_nif = []
list_id = []
list_name = []
for contracted in item['contracted']:
if isinstance(contracted['nif'], int) :
list_nif.append(str(contracted['nif']))
else :
list_nif.append(contracted['nif'].encode('utf-8'))
list_id.append(str(contracted['id']))
list_name.append(contracted['description'].encode('utf-8'))
contracted_nif = ' | '.join(list_nif)
contracted_id = ' | '.join(list_id)
contracted_name = ' | '.join(list_name)
#Process the contracting entities
list_nif = []
list_id = []
list_name = []
for contracting in item['contracting']:
if isinstance(contracting['nif'], int) :
list_nif.append(str(contracting['nif']))
else :
list_nif.append(contracting['nif'].encode('utf-8'))
list_id.append(str(contracting['id']))
list_name.append(contracting['description'].encode('utf-8'))
contracting_nif = ' | '.join(list_nif)
contracting_id = ' | '.join(list_id)
contracting_name = ' | '.join(list_name)
# Write everything to the CSV file in the right order
csv_file.writerow([item['signingDate'], item['description'], item['initialContractualPrice'], item['executionDeadline'], item['executionPlace'], item['publicationDate'], item['id'], cpvs_id, cpvs_description, item['contractTypes'], item['objectBriefDescription'], item['contractingProcedureType'], item['contractFundamentationType'], item['directAwardFundamentationType'], contracted_id, contracted_nif, contracted_name, contracting_id, contracting_nif, contracting_name, item['observations'], item['closeDate'], item['causesDeadlineChange'], item['totalEffectivePrice'], item['causesPriceChange']])
print 'Data was written to ' + file_out + '.'