-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_dataset.py
80 lines (67 loc) · 2.27 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Program to retrieve all assets from the page as a json dataset
import json
from fractions import Fraction
import re
import requests as req
import os
# Markdown index page that is scanned for recipe links, and the base URL that
# the extracted link paths are appended to.
md_file_url = "https://github.com/dpapathanasiou/recipes/raw/master/index/c/cookies.md"
json_file_url_prefix = "https://github.com/dpapathanasiou/recipes/raw/master/index"
# Destination path of the generated dataset.
file_name = "/data/recipes.json"

# Fetch the index page. The timeout keeps a stalled connection from hanging
# the script, and raise_for_status() stops an HTTP error page from being
# scanned for links as if it were the real index.
response = req.get(md_file_url, timeout=30)
response.raise_for_status()
content = response.text

# Extract the url information from the main page: capture the target of every
# markdown link, i.e. the text between "](" and the next ")".
pattern = r'\]\(([^)]+)\)'
matches = re.findall(pattern, content)
# Remove the "../../index/" relative segment so that each link can be appended
# to json_file_url_prefix.
cleaned_links = [match.replace("../../index/", "") for match in matches]
json_urls = [
    f"{json_file_url_prefix}/{clean_link}" for clean_link in cleaned_links
]
def parse_mixed_fraction(mixed_fraction):
    """Convert a mixed number such as "1 1/2" into a single fraction string.

    Args:
        mixed_fraction: Quantity text, e.g. "2", "3/4" or "1 1/2".

    Returns:
        For input made of exactly two whitespace-separated tokens, an integer
        followed by something ``Fraction`` can parse, the string form of their
        sum (``"1 1/2"`` -> ``"3/2"``). Any other input, including a single
        token or text that cannot be parsed, is returned unchanged.
    """
    # split() with no argument tolerates repeated, leading and trailing
    # whitespace, which split(' ') turned into empty tokens.
    parts = mixed_fraction.split()
    if len(parts) != 2:
        return mixed_fraction

    whole_part, fraction_part = parts
    try:
        return str(int(whole_part) + Fraction(fraction_part))
    except (ValueError, ZeroDivisionError):
        # Not a well-formed mixed number (e.g. "a 1/2" or "1 1/0"): hand the
        # text back as-is rather than aborting the caller.
        return mixed_fraction
# Regex applied to each ingredient line. Group 1 is the leading quantity
# (digits, whitespace and "/"), group 2 is the first alphabetic word, an
# optional parenthesised note is skipped, and group 3 is the remaining text up
# to the first comma. Compiled once here instead of once per ingredient.
ingredient_pattern = re.compile(
    r'^([\d\s/]+)\s*([a-zA-Z]+)\s*(?:\([^)]+\))?\s*([^,]*)'
)

# Accumulates one entry per recipe; written to file_name at the end.
# NOTE: the "ingridient" / "ingridients" key spellings are part of the emitted
# data and are intentionally left unchanged.
recipes_data = {
    "recipes": []
}

for url in json_urls:
    final_ingrs = []

    # Get the recipe from its url. The timeout keeps one stalled download from
    # hanging the whole run, and raise_for_status() reports an HTTP error
    # directly instead of as a JSON decoding failure.
    res = req.get(url, timeout=30)
    res.raise_for_status()
    data = res.json()

    # Get the name and ingredients from the json.
    name = data["title"]
    ingr = data["ingredients"]

    # Reformat the ingredients to the required format.
    for ing in ingr:
        match = ingredient_pattern.match(ing)
        if not match:
            # Lines that do not start with a quantity followed by a word are
            # left out of the dataset.
            continue

        amount = parse_mixed_fraction(match.group(1).strip())
        first_word = match.group(2)
        remainder = match.group(3).strip()
        if remainder == "":
            # Nothing follows the first word, so that word is the ingredient
            # itself and there is no unit (e.g. "2 eggs").
            new_ingr = {
                "amount": amount,
                "unit": "",
                "ingridient": first_word,
            }
        else:
            new_ingr = {
                "amount": amount,
                "unit": first_word,
                "ingridient": remainder,
            }
        final_ingrs.append(new_ingr)

    recipe = {
        "title": name,
        "ingridients": final_ingrs,
    }
    recipes_data["recipes"].append(recipe)

with open(file_name, "w", encoding="utf-8") as outfile:
    json.dump(recipes_data, outfile, indent=2)
print("Dataset created successfully")
# final_json_data = json.dumps(recipes_data,indent=2)
# print(final_json_data)