-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathextract_data.py
44 lines (33 loc) · 1.42 KB
/
extract_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os
import tarfile

import requests
from tqdm import tqdm
# Reference: https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/datasets/multi30k.html
# Multi30k archives to fetch (training, validation and test splits).
urls = [
    'https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz',
    'https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz',
    'https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/mmt16_task1_test.tar.gz',
]

# Directory prefix that is prepended verbatim to every file name below.
path = "dataset/"

# Local archive names, derived from the URLs so that the i-th name always
# belongs to the i-th URL. (A hand-written list in a different order makes
# zip(urls, filenames) store each archive under another archive's name.)
filenames = [url.rsplit('/', 1)[-1] for url in urls]
def download(urls, path, filenames, timeout=30):
    """Download every URL to ``path + filename`` while showing a progress bar.

    ``urls`` and ``filenames`` are paired positionally with ``zip``, so the
    i-th URL is stored under the i-th file name.

    Args:
        urls: Iterable of URLs to fetch.
        path: Prefix prepended verbatim to each file name (e.g. ``"dataset/"``).
        filenames: Iterable of local file names, one per URL.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the script forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    for url, filename in zip(urls, filenames):
        target = path + filename

        # Create the destination directory up front; open() does not do it.
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as resp:
            # Fail before touching the disk so that an HTTP error page is
            # never saved under an archive name.
            resp.raise_for_status()

            # No Content-Length header -> unknown total (None) for tqdm.
            total = int(resp.headers.get('content-length', 0)) or None

            with open(target, 'wb') as file, tqdm(
                desc=f'downloading {filename = } to {path = }',
                total=total,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for data in resp.iter_content(chunk_size=1024):
                    size = file.write(data)
                    bar.update(size)
# Fetch all archives listed in `urls` into `path`.
download(urls=urls, path=path, filenames=filenames)
def extract(path, filenames):
    """Unpack each archive ``path + filename`` into the directory ``path``.

    Prints ``Extracted <filename>`` after each archive has been unpacked.

    Args:
        path: Prefix prepended verbatim to each file name; also used as the
            extraction destination.
        filenames: Iterable of archive file names located under ``path``.

    Raises:
        tarfile.TarError: If an archive cannot be read. When the ``data``
            extraction filter is in use, members it refuses (for example
            ones that would land outside ``path``) also raise a
            ``TarError`` subclass.
    """
    # Extraction filters are not available on every interpreter; the tarfile
    # documentation recommends probing for `data_filter` to detect support.
    supports_filters = hasattr(tarfile, 'data_filter')

    for filename in filenames:
        # `with` closes the archive even when extraction raises.
        with tarfile.open(path + filename) as tar:
            if supports_filters:
                # The archives come from the network: refuse absolute paths,
                # parent-directory escapes and special files.
                tar.extractall(path, filter='data')
            else:
                tar.extractall(path)
        print(f'Extracted {filename}')
# Unpack every downloaded archive in place under `path`.
extract(path=path, filenames=filenames)