-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathintake_data.py
106 lines (97 loc) · 4.58 KB
/
intake_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# %%
import pandas as pd
import numpy as np
import pathlib
import re
import logging
import datetime
import utils
# %%
# the local path to the folder with all the files in it
data_dir = "pollen_slides"
# filename of the CSV index written into data_dir at the end of the script
database_name = "database.csv"
# Column-oriented accumulator: each key is a column, each value the list of
# per-image entries.  This will get turned into a pandas dataframe after all
# the files are indexed and added to it.
database = {
    "species": [],
    "date": [],  # the date the image was captured
    "path": [],  # the local path to the image
    "slide_id": [],
    "image_location": [],
    "image_depth": [],
    "image_magnification": [],
    "herbarium_specimen_id": [],
}
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
# %%
# This regex matches dates in the format "mm-dd-yy".  Raw string avoids the
# invalid-escape-sequence SyntaxWarning that "\d" triggers on Python 3.12+.
match_date = re.compile(r"\d{1,2}-\d{1,2}-\d{2}")
# Loop through all the images in the data folder and index each one into
# `database`.
# NOTE(review): validation failures are signalled via `assert` carrying a
# (log_level, message) tuple; asserts are stripped under `python -O`, so this
# only works when the script is run unoptimized.
for f in pathlib.Path(data_dir).glob("**/*.*"):
    # Filter for images
    if f.suffix.lower() not in utils.img_suffixes:
        continue
    # Split the path into a list of folders
    folders = f.parent.parts
    try:
        # Ignore any images that are old (they have a yellow discoloration)
        assert folders[-1] != "Old", (
            "debug",
            f"Skipping {folders[1]} {f.name} because its in an 'Old' folder",
        )
        # Make sure path is the correct length
        assert len(folders) == 3, ("warn", f"Invalid path '{f}'")
        # Make sure the path includes exactly one date in the expected location
        date = match_date.findall(folders[2])
        assert len(date) == 1, ("warn", f"Couldn't extract date from path '{f}'")
        processed_date = pd.to_datetime(date[0])
        # Build the record locally and only commit it to `database` once the
        # whole filename has validated.  (Previously species/date/path were
        # appended before the name-format asserts, so a failed assert left the
        # column lists with unequal lengths and broke pd.DataFrame later.)
        record = {}
        # If the image is older than 2019, it has a different naming convention
        if processed_date.date() < datetime.date(year=2019, month=1, day=1):
            # Make sure the image name is in the expected format
            # A few images have a different naming convention (ex: CF071515 10X.JPG)
            # But these are also a weird color, so I think its okay to ignore them
            name_segments = f.stem.split(" ")
            assert len(name_segments) >= 2, ("warn", f"Invalid name for pre-2019 image '{f.name}'")
            # Location/depth characters are optional in pre-2019 names; fall
            # back to sentinel values when the segment is too short.  Only
            # IndexError is expected here (short string) — anything else
            # should propagate rather than be silently swallowed.
            try:
                record["image_location"] = name_segments[1][3]
            except IndexError:
                record["image_location"] = -1
            try:
                record["image_depth"] = name_segments[1][4]
            except IndexError:
                record["image_depth"] = ''
            # First two characters are the magnification (ex: "10X" -> 10).
            # NOTE(review): the original comment said this is multiplied by 10
            # to match post-2019 images, but the raw value is stored — confirm
            # the units against the post-2019 branches below.
            record["image_magnification"] = int(name_segments[1][:2])
            record["herbarium_specimen_id"] = ""
            record["slide_id"] = "S0"
        # Pre Oct 20th, 2022 (same as after that date but it doesn't include the slide ID and magnification is multiplied by 10)
        elif processed_date.date() < datetime.date(year=2022, month=10, day=20):
            name_segments = f.stem.split("_")
            assert len(name_segments) == 6, ("warn", f"Invalid name for post-2019 pre-oct-2022 image '{f.name}'")
            record["image_location"] = name_segments[4][:2]
            record["image_depth"] = name_segments[4][2]
            # The filename encodes magnification x10; divide to normalize.
            record["image_magnification"] = int(name_segments[3][3:6]) / 10
            record["herbarium_specimen_id"] = int(name_segments[0])
            record["slide_id"] = "S0"
        else:
            name_segments = f.stem.split("_")
            assert len(name_segments) == 6, ("warn", f"Invalid name for post-oct-2022 image '{f.name}'")
            record["image_location"] = name_segments[5][:2]
            record["image_depth"] = name_segments[5][2]
            record["image_magnification"] = int(name_segments[4][:2])
            record["herbarium_specimen_id"] = int(name_segments[1])
            record["slide_id"] = name_segments[0]
        # All checks passed — append to every column atomically so the lists
        # always stay the same length.
        database["species"].append(folders[1])
        database["date"].append(processed_date)
        database["path"].append(str(f))
        for column, value in record.items():
            database[column].append(value)
    except AssertionError as e:
        # Each assert carries a (level, message) tuple; route it to the logger.
        logging_level, logging_message = e.args[0]
        if logging_level == "debug":
            logging.debug(logging_message)
        elif logging_level == "warn":
            logging.warning(logging_message)
# Collapse the accumulated per-column lists into a single dataframe.
df = pd.DataFrame(database)
# %%
# Persist the index as CSV alongside the images it describes; the dataframe's
# positional index carries no information, so it is not written out.
output_path = pathlib.Path(data_dir) / database_name
df.to_csv(output_path, index=False)
# %%