-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare_bible.py
57 lines (40 loc) · 1.78 KB
/
prepare_bible.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
import tarfile
import pandas as pd
from pathlib import Path
from urllib.request import urlretrieve
# Local directory that holds the downloaded archive, its extracted contents
# and the generated TSV.
DATASET_DIR = Path("data/Bible/")

# Location of the AraBench release tarball.
DATASET_URL = "https://alt.qcri.org/resources/mt/arabench/releases/current/AraBench_dataset.tgz"

# Local path of the downloaded tarball: DATASET_DIR joined with the last
# path component of the URL.
DATASET_FILE = str(DATASET_DIR / DATASET_URL.rsplit("/", 1)[-1])
def download_arabench():
    """Download the AraBench archive and unpack it into ``DATASET_DIR``.

    Creates ``DATASET_DIR`` if it does not exist, fetches ``DATASET_URL`` into
    ``DATASET_FILE``, then extracts every member of the gzip-compressed
    tarball into ``DATASET_DIR``.
    """
    os.makedirs(DATASET_DIR, exist_ok=True)
    urlretrieve(DATASET_URL, DATASET_FILE)

    # Use a context manager so the archive handle is closed deterministically,
    # even when extraction raises.
    with tarfile.open(DATASET_FILE, "r:gz") as tar:
        # The archive comes from the network: where the running Python
        # supports extraction filters, use the "data" filter so members
        # cannot escape DATASET_DIR (absolute paths, "..", unsafe links).
        if hasattr(tarfile, "data_filter"):
            tar.extractall(DATASET_DIR, filter="data")
        else:
            tar.extractall(DATASET_DIR)
def load_file(split, dialect):
    """Read one extracted AraBench Bible file and return its stripped lines.

    Args:
        split: Split name interpolated into the filename
            (``main`` passes ``"dev"`` and ``"test"``).
        dialect: One of ``"tn"``, ``"ma"`` or ``"msa"``. ``"msa"`` maps to the
            ``msa.0.ms.ar`` file suffix; the other two map to
            ``mgr.0.<dialect>.ar``.

    Returns:
        A list with one string per line of the file, each with leading and
        trailing whitespace removed.

    Raises:
        AssertionError: If ``dialect`` is not one of the supported codes.
        OSError: If the expected file cannot be opened.
    """
    assert dialect in ("tn", "ma", "msa"), f"unsupported dialect: {dialect!r}"

    suffix = "msa.0.ms.ar" if dialect == "msa" else f"mgr.0.{dialect}.ar"
    filename = Path(DATASET_DIR, "AraBench_dataset", f"bible.{split}.{suffix}")

    # The files contain Arabic text; decode explicitly as UTF-8 instead of
    # relying on the platform's locale encoding, which is not UTF-8 on every
    # system (assumes the AraBench release is UTF-8 encoded).
    with open(filename, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]
def main():
    """Build ``bible.tsv`` from the AraBench Bible dev and test files.

    Downloads and extracts the dataset, concatenates the dev and test lines
    for each of the ``ma``, ``tn`` and ``msa`` files, checks that the three
    resulting lists have the same length, and writes them as the columns of a
    tab-separated file in ``DATASET_DIR``.
    """
    download_arabench()

    # One column per dialect, dev lines followed by test lines. Dict order
    # fixes the column order of the output: ma, tn, msa.
    columns = {
        dialect: load_file("dev", dialect) + load_file("test", dialect)
        for dialect in ("ma", "tn", "msa")
    }

    # Rows are aligned by position, so all columns must be the same length.
    assert len(columns["ma"]) == len(columns["tn"]) == len(columns["msa"])

    bible_df = pd.DataFrame(columns)

    # Known issues in the raw data (not cleaned up here):
    # - The Tunisian Bible has chapter numbers as part of the text
    #   (check dev.0 - lines 0, 17, 32, ... ||| test.0 - lines "1174 - 600").
    # - Some verses have ":" at the start of the line (a verse marker?),
    #   which is strange.
    # - Chapter numbers appear elsewhere as well (check dev.0 - lines 143, 178).
    # - Some lines carry footnotes(?) starting with "#" that are not part of
    #   the verse (check dev.9 - lines 11, ...).
    output_path = str(Path(DATASET_DIR, "bible.tsv"))
    bible_df.to_csv(output_path, sep="\t", index=False)
# Run the download-and-convert pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()