Skip to content

Commit

Permalink
added wmt18 en<>de (#431)
Browse files Browse the repository at this point in the history
* added wmt18 en<>de to autopilot
  • Loading branch information
mjpost authored Jun 12, 2018
1 parent 83468d2 commit 7a1f3f7
Showing 1 changed file with 81 additions and 0 deletions.
81 changes: 81 additions & 0 deletions contrib/autopilot/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@
"http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz",
"fc6b83b809347e64f511d291e4bc8731",
ARCHIVE_TAR),
"news_commentary_v13": RawFile("News Commentary v13",
"http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz",
"07f45ec726e8fc822a8e43606a889e2d",
ARCHIVE_TAR),
"giga_fren_wmt10": RawFile("10^9 French-English corpus",
"http://www.statmt.org/wmt10/training-giga-fren.tar",
"0b12e20027d5b5f0dfcca290c72c8953",
Expand Down Expand Up @@ -104,6 +108,10 @@
"http://opus.nlpl.eu/download.php?f=SETIMES2/en-tr.txt.zip",
"544cec8a631f7820afab6a05451c13a7",
ARCHIVE_ZIP),
"paracrawl_release1_en_de": RawFile("Paracrawl Filtered v1.0",
"https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-de.zipporah0-dedup-clean.tgz",
"30e67e94d111ea675c0567e1c1aa338c",
ARCHIVE_TAR),
# WMT dev and test sets
"wmt14_dev": RawFile("WMT14 development sets",
"http://www.statmt.org/wmt14/dev.tgz",
Expand All @@ -113,6 +121,10 @@
"http://data.statmt.org/wmt17/translation-task/dev.tgz",
"9b1aa63c1cf49dccdd20b962fe313989",
ARCHIVE_TAR),
"wmt18_dev": RawFile("WMT18 development sets",
"http://data.statmt.org/wmt18/translation-task/dev.tgz",
"486f391da54a7a3247f02ebd25996f24",
ARCHIVE_TAR),
"wmt14_test": RawFile("WMT14 test sets",
"http://www.statmt.org/wmt14/test-filtered.tgz",
"84c597844c1542e29c2aff23aaee4310",
Expand All @@ -121,6 +133,10 @@
"http://data.statmt.org/wmt17/translation-task/test.tgz",
"86a1724c276004aa25455ae2a04cef26",
ARCHIVE_TAR),
"wmt18_test": RawFile("WMT18 test sets",
"http://data.statmt.org/wmt18/translation-task/test.tgz",
"f996c245ecffea23d0006fa4c34e9064",
ARCHIVE_TAR),
# Stanford NLP pre-processed data
"stanford_wmt14_train_en": RawFile("Stanford pre-processed WMT14 English training data",
"https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en",
Expand Down Expand Up @@ -517,6 +533,71 @@
"wmt17_test/test/newstest2017-entr-ref.tr.sgm",
TEXT_UTF8_RAW_SGML),
]),
# WMT18 translation tasks
"wmt18_de_en": Task(description="WMT18 German-English news",
url="http://statmt.org/wmt18/translation-task.html",
src_lang="de",
trg_lang="en",
bpe_op=32000,
train=[
("europarl_v7/training/europarl-v7.de-en.de",
"europarl_v7/training/europarl-v7.de-en.en",
TEXT_UTF8_RAW),
("paracrawl_release1_en_de/paracrawl-release1.en-de.zipporah0-dedup-clean.de",
"paracrawl_release1_en_de/paracrawl-release1.en-de.zipporah0-dedup-clean.en",
TEXT_UTF8_RAW),
("common_crawl_wmt13/commoncrawl.de-en.de",
"common_crawl_wmt13/commoncrawl.de-en.en",
TEXT_UTF8_RAW),
("news_commentary_v13/training-parallel-nc-v13/news-commentary-v13.de-en.de",
"news_commentary_v13/training-parallel-nc-v13/news-commentary-v13.de-en.en",
TEXT_UTF8_RAW),
("rapid_eu_2016/rapid2016.de-en.de",
"rapid_eu_2016/rapid2016.de-en.en",
TEXT_UTF8_RAW),
],
dev=[
("wmt18_dev/dev/newstest2017-deen-src.de.sgm",
"wmt18_dev/dev/newstest2017-deen-ref.en.sgm",
TEXT_UTF8_RAW_SGML),
],
test=[
("wmt18_test/test/newstest2018-deen-src.de.sgm",
"wmt18_test/test/newstest2018-deen-ref.en.sgm",
TEXT_UTF8_RAW_SGML),
]),
"wmt18_en_de": Task(description="WMT18 English-German news",
url="http://statmt.org/wmt18/translation-task.html",
src_lang="en",
trg_lang="de",
bpe_op=32000,
train=[
("europarl_v7/training/europarl-v7.de-en.en",
"europarl_v7/training/europarl-v7.de-en.de",
TEXT_UTF8_RAW),
("paracrawl_release1_en_de/paracrawl-release1.en-de.zipporah0-dedup-clean.en",
"paracrawl_release1_en_de/paracrawl-release1.en-de.zipporah0-dedup-clean.de",
TEXT_UTF8_RAW),
("common_crawl_wmt13/commoncrawl.de-en.en",
"common_crawl_wmt13/commoncrawl.de-en.de",
TEXT_UTF8_RAW),
("news_commentary_v13/training-parallel-nc-v13/news-commentary-v13.de-en.en",
"news_commentary_v13/training-parallel-nc-v13/news-commentary-v13.de-en.de",
TEXT_UTF8_RAW),
("rapid_eu_2016/rapid2016.de-en.en",
"rapid_eu_2016/rapid2016.de-en.de",
TEXT_UTF8_RAW),
],
dev=[
("wmt18_dev/dev/newstest2017-ende-src.en.sgm",
"wmt18_dev/dev/newstest2017-ende-ref.de.sgm",
TEXT_UTF8_RAW_SGML),
],
test=[
("wmt18_test/test/newstest2018-ende-src.en.sgm",
"wmt18_test/test/newstest2018-ende-ref.de.sgm",
TEXT_UTF8_RAW_SGML),
]),
# WNMT18 shared task
"wnmt18_en_de": Task(description="WNMT18 English-German (WMT14 news pre-processed)",
url="https://sites.google.com/site/wnmt18/shared-task",
Expand Down

0 comments on commit 7a1f3f7

Please sign in to comment.