"""
Purpose: this block counts the number of GBIF species occurrences in pixels of the input raster dataset for the defined species or classes.
INPUT
- Raster dataset in any coordinate reference system (GeoTIFF). It can represent any natural or social characteristics of area, for example, vegetation classes or land use.
Format: GeoTIFF
Mandatory: yes
- GBIF occurrence datacube with unique records pre-extracted through GBIF occurrence data cube: https://techdocs.gbif.org/en/data-use/data-cubes.
Format: CSV (dataset) and JSON (metadata)
Mandatory: yes
OUTPUT
- Regridded GBIF occurrence datacube inheriting the specifications of the input raster dataset, which has one band with the calculated occurrence count.
Format: GeoTIFF
Mandatory: yes
ISSUES
- GBIF backbone taxonomy does not define Reptilia as a separate class (class with id=358 dedicated to Reptilia database). For the purposes of the case study, two Reptilia classes (Testudines, taxon key 11418114) and (Squamata, taxon key, 11592253) have been used.
- For largest datasets (Aves class) the following issue faced (17739293 records):
"numpy.core._exceptions._ArrayMemoryError: Unable to allocate 812. MiB for an array with shape (6, 17739293) and data type object".
It is solved by chunking and filtering out records outside of the bounding box.
"""
import pandas as pd
import numpy as np
from osgeo import gdal, ogr, osr, gdalconst
from pyproj import Transformer
import os
import re
import yaml
import warnings
import math
# REDUNDANT - replaced with configuration file
"""
# paths to input and output files
input_dir = 'input/'
output_dir = 'output/'
input_ds = 'ict_2022.tif'
"""
# load configuration from YAML file
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)
# load paths from config file
input_dir = config.get('input_dir')
output_dir = config.get('output_dir')
output_dir_gbif = config.get('output_dir_gbif')
# load filenames from config file
input_ds = config.get('input_ds')
gbif_datacube_csv = config.get('gbif_datacube_csv')
""" # REDUNDANT
# gbif_output_datacube = config.get('gbif_datacube_csv') # the same
"""
# load current taxon key(s)
taxon_key = config.get('gbif_taxon_key')
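# For reference, a minimal sketch of the expected config.yaml (the key names are the
# ones read above; the values shown here are illustrative, not from the repository):
"""
input_dir: input/
output_dir: output/
output_dir_gbif: output/gbif/
input_ds: ict_2022.tif
gbif_datacube_csv: datacube.csv
gbif_taxon_key: 212
"""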
# path to input raster dataset
raster_path = os.path.join(input_dir, input_ds)
raster_path = os.path.normpath(raster_path)
# path to input GBIF datacube
csv_path = os.path.join(output_dir_gbif, gbif_datacube_csv)
csv_path = os.path.normpath(csv_path)
# path to transformed datacube
output_csv_path = os.path.join(output_dir, gbif_datacube_csv)
output_csv_path = os.path.normpath(output_csv_path)
# create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# for exporting the GBIF datacube to a new GeoTIFF file - the output filename mirrors the datacube CSV name
gbif_datacube_tif = gbif_datacube_csv.replace('.csv', '.tif')  # replace the extension
output_raster_path = os.path.join(output_dir, gbif_datacube_tif)
# for exporting the GBIF datacube as a new band of the input dataset - the name is built from the base name and extension
"""
# extract the base name (without extension) and the file extension
input_raster_base, extension = os.path.splitext(os.path.basename(input_ds))
# apply the renaming pattern from the config file
output_raster_pattern = config['output_raster']
output_raster = output_raster_pattern.format(input_raster_base=input_raster_base, extension=extension.lstrip('.'))
"""
# debug: print the paths
print(f"Input raster: {raster_path}")
print(f"Input GBIF datacube: {csv_path}")
print(f"Output raster: {output_raster_path}")
print(f"Current GBIF taxon key(s):{taxon_key}")
print("-" * 40)
# TODO - loop over a list of classes and create a separate GeoTIFF for each of them
"""
issue with classKey=212 (Aves):
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 406. MiB for an array
with shape (3, 17739293) and data type int64
"""
# REDUNDANT - for exporting the output raster as band 2 (while keeping the original raster)
"""
# extract the base name of the input raster file and define the name of the final raster output
input_filename = os.path.basename(raster_path)
match = re.search(r'(\d{4})', input_filename)  # find four digits in a row (the year) with a regular expression
year_pos = match.start()
output_filename = input_filename[:year_pos] + 'gbif_' + input_filename[year_pos:]  # slice the filename at the year position and insert 'gbif_'
new_raster_path = os.path.join(output_dir, output_filename)  # output raster
"""
# function to calculate pixel indices
def calculate_pixel_indices(x_cart, y_cart):
    pixel_col = int((x_cart - raster_geo[0]) / raster_geo[1])  # column index
    pixel_row = int((y_cart - raster_geo[3]) / raster_geo[5])  # row index
    return pixel_row, pixel_col
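# Worked example (illustrative geotransform, not from the input data):
# with raster_geo = (400000, 100, 0, 4600000, 0, -100), a point at
# x_cart = 400250, y_cart = 4599850 yields
# pixel_col = int((400250 - 400000) / 100)    = 2
# pixel_row = int((4599850 - 4600000) / -100) = 1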
# function to check if a point is within the raster extent
def point_within_raster_extent(x_cart, y_cart):
    return (minx <= x_cart <= maxx) and (miny <= y_cart <= maxy)
## RASTER AND COORDINATES PREPARATION
# import the RasterTransform class from the raster_proc module
from raster_proc import RasterTransform
# check the cartesian/projected CRS
print("Checking the coordinate reference system of input raster dataset...")
is_cart, epsg_code = RasterTransform(raster_path).check_cart_crs()
# check the resolution
print("Checking the spatial resolution of input raster dataset...")
xres, yres = RasterTransform(raster_path).check_res()
# define the CRS of the GBIF datacube; the raster CRS was detected above (for Catalonia, UTM zone 31N, EPSG:25831)
csv_crs = 'EPSG:4326'  # GBIF occurrence datacubes are always delivered in WGS84
raster_crs = epsg_code
# function to transform coordinates (in the case of Catalonia, from EPSG:4326 to EPSG:25831)
def transform_coordinates(lat_array, lon_array):
    transformer = Transformer.from_crs(csv_crs, raster_crs, always_xy=True)
    x, y = transformer.transform(lon_array, lat_array)
    return x, y
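# Usage sketch (illustrative coordinates): the function accepts scalars or numpy arrays.
"""
x, y = transform_coordinates(np.array([41.39, 42.00]), np.array([2.16, 3.00]))
# x and y are easting/northing arrays in the units of the raster CRS (metres for EPSG:25831)
"""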
# open raster to get its extent
raster_ds = gdal.Open(raster_path)
raster_geo = raster_ds.GetGeoTransform()
raster_band = raster_ds.GetRasterBand(1)
nodata_value = raster_band.GetNoDataValue() # extract nodata value
# calculate raster extent (raster_geo[5] is negative for north-up rasters)
minx, miny, maxx, maxy = (
    raster_geo[0],                                           # minx
    raster_geo[3] + raster_geo[5] * raster_ds.RasterYSize,   # miny
    raster_geo[0] + raster_geo[1] * raster_ds.RasterXSize,   # maxx
    raster_geo[3]                                            # maxy
)
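# For reference: the GDAL geotransform is the tuple
# (origin_x, pixel_width, row_rotation, origin_y, column_rotation, pixel_height),
# where pixel_height (raster_geo[5]) is negative for north-up rasters,
# which is why maxy equals raster_geo[3] and miny is derived from the raster height.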
# print raster extent for debugging
print("The spatial extent of the input raster dataset:", (minx, miny, maxx, maxy))
# transform coordinates if the EPSG code is not 4326 (i.e. it does not match the CRS of the GBIF datacube)
if epsg_code != 4326:
    print(f"The input raster dataset has EPSG code {epsg_code}, which differs from the projection of the GBIF occurrence datacube.")
    # previous version, reading the dataframe without chunks
    """
    # read the CSV file with the correct delimiter for the SQL TSV ZIP export
    df = pd.read_csv(csv_path, delimiter='\t')
    print(f"Processing the following dataset: \n{df.head()}")
    """
    # debug
    """
    # count total records
    df = pd.read_csv(csv_path, delimiter='\t')
    total_records_1 = len(df)
    print(f"Total records: {total_records_1}")
    """
    # empty dataframe to store the processed results
    # NOTE: this re-accumulates every chunk in memory and is only needed for the
    # optional debug exports below; disable it for very large datasets
    processed_df = pd.DataFrame()
    # list of per-chunk pixel counts, aggregated after the loop
    pixel_counts_list = []
    # read the dataframe in chunks
    n = 2000000  # chunk row size
    df_chunks = pd.read_csv(csv_path, delimiter='\t', chunksize=n)
    # find out the total number of rows in the CSV file
    with open(csv_path) as f:
        total_rows = sum(1 for _ in f) - 1  # subtract 1 for the header
    total_chunks = math.ceil(total_rows / n)  # round up to the nearest whole number
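    # Worked example: for the Aves datacube quoted in the header (17,739,293 records)
    # and n = 2,000,000 rows per chunk, total_chunks = ceil(17739293 / 2000000) = 9.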
    # initialise a counter of records outside the bounding box ('False' values in the 'bbox' column)
    false_count = 0
    # initialise the total number of records in the dataframe
    total_records = 0
    ## process each chunk
    # initialise the chunk number
    chunk_num = 1
    for chunk in df_chunks:
        # print progress
        print(f"Processing chunk {chunk_num} out of {total_chunks}...")
        """
        print(chunk.head())  # debug: print the chunk
        """
        # count records in the chunk and update the running total
        chunk_records = len(chunk)
        total_records += chunk_records
        # transform the coordinates of the whole chunk at once
        print("Proceeding with coordinate transformation...")
        chunk['x_cart'], chunk['y_cart'] = transform_coordinates(chunk['lat'].values, chunk['lon'].values)
        print("Coordinates have been converted.")
        """
        # debug:
        print(chunk[['x_cart', 'y_cart']].head())
        """
        # flag rows inside the raster bounding box (vectorised instead of a per-row
        # iterrows pass, which also limits the warning to one per chunk)
        chunk['bbox'] = chunk['x_cart'].between(minx, maxx) & chunk['y_cart'].between(miny, maxy)
        outside_count = int((~chunk['bbox']).sum())
        if outside_count > 0:
            false_count += outside_count
            warnings.warn("Occurrence record(s) found outside of the bounding box of the input raster dataset!")
        # filter rows where bbox is True; take a copy to avoid the SettingWithCopyWarning
        bbox_true_df = chunk[chunk['bbox']].copy()
        # initialise 'pixel_row' and 'pixel_col' columns
        bbox_true_df['pixel_row'] = None
        bbox_true_df['pixel_col'] = None
        # assign pixel indices based on the transformed coordinates
        for index, row in bbox_true_df.iterrows():
            pixel_row, pixel_col = calculate_pixel_indices(row['x_cart'], row['y_cart'])
            bbox_true_df.at[index, 'pixel_row'] = pixel_row
            bbox_true_df.at[index, 'pixel_col'] = pixel_col
        # attach the pixel row and column values to the chunk
        # (DataFrame.update cannot add new columns, so a join on the index is used instead)
        chunk = chunk.join(bbox_true_df[['pixel_row', 'pixel_col']])
        # append the processed chunk to the results dataframe
        processed_df = pd.concat([processed_df, chunk], ignore_index=True)
        # calculate pixel counts for the current chunk and keep them for later aggregation
        pixel_counts_list.append(bbox_true_df.groupby(['pixel_row', 'pixel_col']).size())
        # increment the chunk number
        chunk_num += 1
    # aggregate the counts across chunks: the same pixel can occur in several chunks,
    # so the per-chunk counts must be summed rather than overwritten
    if pixel_counts_list:
        pixel_counts_df = pd.concat(pixel_counts_list).groupby(level=[0, 1]).sum()
    else:
        pixel_counts_df = pd.Series(dtype='int64')
    # calculate the share of records outside of the bounding box
    # (for the Catalonia bbox this is expected - the datacube is fetched for the whole of Spain)
    false_share = false_count / total_records
    print(f"The share of records outside of the bounding box is {false_share:.2%}.")
    print("-" * 40)
    # debug: check the reprojected coordinates
    """
    print(processed_df[['x_cart', 'y_cart', 'bbox']].head())
    """
else:
    print("The input raster dataset has EPSG:4326. No need to transform the coordinates of GBIF occurrences.")
    # NOTE: the counting pipeline above currently runs only for projected rasters;
    # for an EPSG:4326 raster, pixel_counts_df is never populated
# debug: print headers
"""
print(f"Headers of the processed dataframe are: {list(processed_df.columns)}")
"""
# debug: save the reprojected and filtered dataframe to a separate csv
"""
processed_df.to_csv(os.path.join(output_dir, 'filtered_datacube.csv'), index=False)
"""
# debug: save the updated dataframe to a new csv
"""
processed_df.to_csv(output_csv_path, index=False)
print(f"Indexed data saved to {output_csv_path}.")
"""
# REDUNDANT - previous version for non-chunked dataframe
"""
# calculate pixel counts
pixel_counts = df[df['bbox']].groupby(['pixel_row', 'pixel_col']).size()
"""
# create the output GeoTIFF dataset with a single band (for pixel counts)
driver = gdal.GetDriverByName('GTiff')
output_raster = driver.Create(output_raster_path, raster_ds.RasterXSize, raster_ds.RasterYSize, 1, gdal.GDT_Int32)  # Int32 to match counts_array below
# check that the output raster was created successfully
if output_raster is None:
    raise Exception(f"Failed to create output raster file: {output_raster_path}")
# set the projection to the CRS of the input raster dataset
srs = osr.SpatialReference()
srs.ImportFromEPSG(int(epsg_code))  # explicitly cast the EPSG code to integer, otherwise it might cause an issue
output_raster.SetProjection(srs.ExportToWkt())
# set the GeoTransform to match the original raster
output_raster.SetGeoTransform(raster_geo)
# array for the counts (only one band is needed now)
counts_array = np.zeros((raster_ds.RasterYSize, raster_ds.RasterXSize), dtype=np.int32)
# debug: inspect the aggregated counts (a Series with a (pixel_row, pixel_col) MultiIndex)
"""
print(type(pixel_counts_df))
print(pixel_counts_df.head())
"""
# populate counts_array with the pixel counts
for (row, col), count in pixel_counts_df.items():
    # ensure row and col are integers
    row = int(row)
    col = int(col)
    # points exactly on the bottom/right edge of the extent pass the inclusive bbox
    # test but map one index beyond the grid, so clamp them to the last row/column
    row = min(row, raster_ds.RasterYSize - 1)
    col = min(col, raster_ds.RasterXSize - 1)
    # add the count to counts_array (+= so that clamped cells accumulate correctly)
    counts_array[row, col] += count
# exclude occurrences beyond the input raster: apply the nodata mask of the original band to the count array
original_band_data = raster_band.ReadAsArray()  # read the original band data to get the nodata mask
if nodata_value is not None:  # the nodata value was already extracted from the original band above
    counts_array[original_band_data == nodata_value] = nodata_value
# write counts_array to the first and only band of the output raster
output_band = output_raster.GetRasterBand(1)
output_band.WriteArray(counts_array)
if nodata_value is not None:
    output_band.SetNoDataValue(nodata_value)  # propagate the nodata value of the input raster
print(f"Output raster dataset has been written to {output_raster_path}.")
# save and close the output raster
output_band.FlushCache()
output_raster.FlushCache()
output_raster = None  # close the dataset
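# Optional sanity check (a sketch, not part of the pipeline): read the output back
# and print basic statistics of the counts band.
"""
check_ds = gdal.Open(output_raster_path)
check_band = check_ds.GetRasterBand(1)
stats = check_band.GetStatistics(0, 1)  # (min, max, mean, stddev), forced computation
print(f"Counts band statistics - min: {stats[0]}, max: {stats[1]}, mean: {stats[2]}, stddev: {stats[3]}")
check_ds = None
"""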
# Alternative block - write the GBIF datacube into band 2 while keeping the input data in band 1
"""
# create a new GeoTIFF dataset for writing
driver = gdal.GetDriverByName('GTiff')
new_raster = driver.Create(new_raster_path, raster_ds.RasterXSize, raster_ds.RasterYSize, 2, gdal.GDT_Int16)  # create a new raster with 2 bands
# check that the new raster was created successfully
if new_raster is None:
    raise Exception(f"Failed to create new raster file: {new_raster_path}")
# set the projection to EPSG:25831
srs = osr.SpatialReference()
srs.ImportFromEPSG(25831)
new_raster.SetProjection(srs.ExportToWkt())
# set the GeoTransform to match the original raster
original_band_data = raster_band.ReadAsArray()
new_raster.SetGeoTransform(raster_geo)
# write the original band data to band 1 of the new raster
new_raster.GetRasterBand(1).WriteArray(raster_band.ReadAsArray())
# set the nodata value in band 1 of the new raster
new_raster.GetRasterBand(1).SetNoDataValue(nodata_value)
# array for the counts (band 2)
counts_array = np.zeros((raster_ds.RasterYSize, raster_ds.RasterXSize), dtype=np.float32)
# populate counts_array with the pixel counts (pixel_counts comes from the non-chunked version above)
for (row, col), count in pixel_counts.items():
    # ensure row and col are integers
    row = int(row)
    col = int(col)
    # assign the count to counts_array
    counts_array[row, col] = count
# exclude occurrences beyond the input raster: apply the nodata mask from band 1 to band 2
counts_array[original_band_data == nodata_value] = nodata_value
# TODO - fix: decimal values are not saved (the GDT_Int16 raster casts everything to integer)
# write counts_array to band 2 of the new raster
new_band = new_raster.GetRasterBand(2)
new_band.WriteArray(counts_array)
new_band.SetNoDataValue(nodata_value)  # set the nodata value from band 1
# save and close the new raster
new_band.FlushCache()
new_raster.FlushCache()
new_raster = None  # close the dataset
print(f"Number of rows in each pixel written to {new_raster_path}.")
"""