-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path2_dopa_get_species.py
200 lines (161 loc) · 7.12 KB
/
2_dopa_get_species.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import requests
import pandas as pd
from io import StringIO
import yaml
import os
import urllib.parse
"""
This block:
- fixes spelling of scientific names of species
- fetches IUCN IDs of species
- fetches all available columns by IUCN ID
- combines the results into a single table with all available columns for each species
INPUT
- Table of species with scientific names filled in by the user. One column is required ('scientificName'); the script reads the names from the first column of the file.
Format: CSV
Mandatory: yes
OUTPUT
- Combined table with scientific names of species and columns from IUCN.
Format: CSV
Mandatory: yes
"""
# load the pipeline configuration (looked up relative to the current working directory)
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# input and output directories come from the config file
input_dir, output_dir = config['input_dir'], config['output_dir']

# full paths: the species list to read and the IUCN table to write
input_species_csv = os.path.join(input_dir, config['input_species'])
output_iucn_csv = os.path.join(output_dir, config['iucn_csv'])

# Debug: show the resolved paths before any request is made
print(f"Path to the input CSV with scientific names: {input_species_csv}")
print(f"Path to the output CSV with IUCN data: {output_iucn_csv}")
print('-' * 40)
# 1st function to fetch IUCN IDs by scientific names
def fetch_id_from_name_IUCN(scientific_name, timeout=60):
    """
    Fetches the IUCN species ID for a scientific name through the DOPA REST service.

    Parameters:
    - scientific_name: The scientific name of the species.
    - timeout: Seconds to wait for the service before giving up (default: 60).

    Returns:
    - IUCN ID ('id_no' of the first returned record), or None if no record is
      found, the service answers with an error status, or the request fails.
    """
    url = "https://dopa-services.jrc.ec.europa.eu/services/d6dopa/dopa_43/get_dopa_species_list"
    params = {
        "format": "json",
        "f_binomial": scientific_name,
        "includemetadata": "true",
        "fields": "id_no"
    }
    # quote_via=quote percent-encodes spaces as %20 (urlencode's default, quote_plus, would emit '+')
    encoded_params = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)

    try:
        # the request itself must be inside the try block, otherwise connection
        # errors and timeouts escape the handler below and abort the whole run
        response = requests.get(url, params=encoded_params, timeout=timeout)
        if response.status_code not in (200, 201):
            print(f"Error fetching IUCN ID for {scientific_name}: {response.status_code}")
            return None
        response_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decoding error is not a RequestException subclass
        print(f"An error occurred: {e}")
        return None

    if 'records' in response_data and response_data['records']:
        iucn_id = response_data['records'][0].get('id_no')
        if iucn_id is not None:
            print(f"Unique IUCN ID for {scientific_name}: {iucn_id}")
            return iucn_id

    print(f"No IUCN ID found for species: {scientific_name}")
    return None
# 2nd function to fetch all available data by IUCN IDs
def fetch_IUCN_data_by_id(iucn_id, timeout=60):
    """
    Fetches IUCN data (habitats, threats, etc.) by IUCN ID through the DOPA REST service for one species.

    Parameters:
    - iucn_id: IUCN unique ID of the species (sent to the service as 'a_id_no').
    - timeout: Seconds to wait for the service before giving up (default: 60).

    Returns:
    - The decoded JSON response as a dictionary with a non-empty 'records' list,
      or None if no records are returned, the service answers with an error
      status, or the request fails.
    """
    url = "https://dopa-services.jrc.ec.europa.eu/services/d6dopa/dopa_43/get_dopa_species"
    params = {
        "format": "json",
        "a_id_no": iucn_id,
        "includemetadata": "true",
        "fields": "binomial,research_needed_code,genus,family,research_needed_name,order_,class,id_no,"
                  "conservation_needed_code,usetrade_code,conservation_needed_name,ecosystems,habitat_code,"
                  "usetrade_name,habitat_name,country_code,country_name,stress_code,stress_name,threat_code,"
                  "threat_name,endemic,country_n,threatened,category"
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code not in (200, 201):
            print(f"Error fetching species details for ID: {iucn_id}: {response.status_code}")
            return None
        species_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decoding error is not a RequestException subclass
        print(f"An error occurred: {e}")
        return None

    # any non-empty record list counts as data; requiring more than one record
    # would silently drop species that have exactly one record
    if 'records' in species_data and species_data['records']:
        print(f"Detailed IUCN data found for ID: {iucn_id}.")
        return species_data

    print(f"No detailed IUCN data found for ID: {iucn_id}.")
    return None
# main function to fetch IUCN data using species names from the CSV
def dopa_fetch_iucn(input_species_csv):
    """
    Fetches IUCN species data from the DOPA REST service for species listed in an input CSV file.

    The scientific names are taken from the first column of the CSV. Empty
    cells and whitespace-only names are skipped, surrounding whitespace is
    trimmed, and each distinct name is queried only once.

    Parameters:
    - input_species_csv: A path to the input CSV file with the scientific names of species.

    Returns:
    - df_list: A list of dataframes, each containing data for one species.
      Empty if the file is missing, empty, or none of the species could be resolved.
    """
    # read species names directly from the first column of the CSV
    try:
        species_df = pd.read_csv(input_species_csv)
        first_column = species_df.iloc[:, 0]
    except (FileNotFoundError, pd.errors.EmptyDataError, IndexError) as e:
        # EmptyDataError: the file has no content at all; IndexError: no columns to select
        print(f"Error reading species from CSV: {e}")
        return []

    # drop missing cells, trim whitespace and keep the first occurrence of each
    # name (dict.fromkeys preserves order) so the service is not queried twice
    trimmed_names = (str(name).strip() for name in first_column.dropna())
    species_list = list(dict.fromkeys(name for name in trimmed_names if name))

    df_list = []
    for species_name in species_list:
        # Step 1: fetch the IUCN ID using the species name
        iucn_id = fetch_id_from_name_IUCN(species_name)
        if not iucn_id:
            print(f"No IUCN ID found for {species_name}.")
            print('-'*40)
            continue

        # Step 2: fetch full IUCN data using the IUCN ID
        iucn_data = fetch_IUCN_data_by_id(iucn_id)
        if not iucn_data:
            print(f"No detailed data found for {species_name}.")
            print('-'*40)
            continue

        # convert the JSON records to a dataframe (one row per record)
        df_list.append(pd.DataFrame(iucn_data['records']))
        print('-'*40)

    return df_list
# fetch the data for every species in the input CSV, get a list of dataframes
df_list = dopa_fetch_iucn(input_species_csv)
# pd.concat raises "No objects to concatenate" on an empty list, so stop here
# with a clear message when nothing could be fetched
if not df_list:
    raise SystemExit(f"No IUCN data could be fetched for the species in {input_species_csv}; nothing to write.")
# combine all dataframes into one
combined_df = pd.concat(df_list, ignore_index=True, sort=False)
# group the dataframe by 'id_no' (one group per species)
grouped_df = combined_df.groupby('id_no')
# concatenate values for the rest of the columns
concat_columns = combined_df.drop(columns=['id_no']).columns.tolist()  # create a list of all columns, excluding ID
# select only the columns that need to be concatenated
df_columns_to_concat = grouped_df[concat_columns]
# define a function to concatenate non-null, unique values with '|'
def concatenate_unique_values(column):
    """
    Joins the distinct non-null values of a column into one '|'-separated string.

    Parameters:
    - column: A pandas Series (e.g. one column of one group).

    Returns:
    - A string with the unique values in order of first appearance, or an
      empty string if the column holds no non-null values.
    """
    non_null = column.dropna()                  # remove null values
    distinct = non_null.astype(str).unique()    # convert to string, keep unique values only
    return '|'.join(distinct)
# collapse each 'id_no' group to a single row by applying the concatenation
# function to every column, then turn the 'id_no' index back into a regular column
df_final = df_columns_to_concat.agg(concatenate_unique_values).reset_index()

# write the final table as a '|'-separated CSV without the dataframe index
df_final.to_csv(output_iucn_csv, sep='|', index=False)

print(f"Data from IUCN has been fetched and concatenated for the species in {input_species_csv}")
# TODO - implement other scopes of IUCN assessment, continental and regional ones (not only global ones): https://www.iucnredlist.org/regions/european-red-list, https://www.iucnredlist.org/regions/mediterranean-red-list
# TODO - check Catalonian red list through Datos Gob ES API: https://datos.gob.es/es/apidata (not through Socrata API). SPARQL available: https://datos.gob.es/en/sparql