Disable WMT in preparation for rewrite (#254)

Ryan Sepassi · copybara-github · commit 211cb6f082c5 · 2019-03-20T15:21:09.000-07:00
PiperOrigin-RevId: 239486187
diff --git a/docs/datasets.md b/docs/datasets.md
@@ -65,8 +65,6 @@ np_datasets = tfds.as_numpy(datasets)
   * [`"flores_translate_neen"`](#flores_translate_neen)
   * [`"flores_translate_sien"`](#flores_translate_sien)
   * [`"ted_multi_translate"`](#ted_multi_translate)
-  * [`"wmt_translate_ende"`](#wmt_translate_ende)
-  * [`"wmt_translate_enfr"`](#wmt_translate_enfr)
 * [`video`](#video)
   * [`"bair_robot_pushing_small"`](#bair_robot_pushing_small)
   * [`"moving_mnist"`](#moving_mnist)
@@ -2122,174 +2120,6 @@ VALIDATION |      6,049
 
 ---
 
-### `"wmt_translate_ende"`
-
-Translate dataset based on the data from statmt.org.
-
-
-* URL: [http://www.statmt.org/wmt18/](http://www.statmt.org/wmt18/)
-* `DatasetBuilder`: [`tfds.translate.wmt_ende.WmtTranslateEnde`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/translate/wmt_ende.py)
-
-`wmt_translate_ende` is configured with `tfds.translate.wmt_ende.WMTConfig` and has the following
-configurations predefined (defaults to the first one):
-
-* `"ende_plain_text_t2t"` (`v0.0.2`) (`Size: 1.60 GiB`): Translation dataset from en to de, uses encoder plain_text. It uses the following data files (see the code for exact contents): {"dev": ["wmt17_newstest13"], "train": ["wmt18_news_commentary_ende", "wmt13_commoncrawl_ende", "wmt13_europarl_ende"]}.
-
-* `"ende_subwords8k_t2t"` (`v0.0.2`) (`Size: 1.60 GiB`): Translation dataset from en to de, uses encoder subwords8k. It uses the following data files (see the code for exact contents): {"dev": ["wmt17_newstest13"], "train": ["wmt18_news_commentary_ende", "wmt13_commoncrawl_ende", "wmt13_europarl_ende"]}.
-
-
-#### `"wmt_translate_ende/ende_plain_text_t2t"`
-
-```python
-Translation({
-    'de': Text(shape=(), dtype=tf.string, encoder=None),
-    'en': Text(shape=(), dtype=tf.string, encoder=None),
-})
-```
-
-
-
-#### `"wmt_translate_ende/ende_subwords8k_t2t"`
-
-```python
-Translation({
-    'de': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8267>),
-    'en': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8216>),
-})
-```
-
-
-
-
-#### Statistics
-Split  | Examples
-:----- | ---:
-ALL        |  4,595,289
-TRAIN      |  4,592,289
-VALIDATION |      3,000
-
-
-#### Urls
- * [http://www.statmt.org/wmt18/](http://www.statmt.org/wmt18/)
-
-#### Supervised keys (for `as_supervised=True`)
-`(u'en', u'de')`
-
-#### Citation
-```
-@InProceedings{bojar-EtAl:2018:WMT1,
-  author    = {Bojar, Ond{r}ej  and  Federmann, Christian  and  Fishel, Mark
-    and Graham, Yvette  and  Haddow, Barry  and  Huck, Matthias  and
-    Koehn, Philipp  and  Monz, Christof},
-  title     = {Findings of the 2018 Conference on Machine Translation (WMT18)},
-  booktitle = {Proceedings of the Third Conference on Machine Translation,
-    Volume 2: Shared Task Papers},
-  month     = {October},
-  year      = {2018},
-  address   = {Belgium, Brussels},
-  publisher = {Association for Computational Linguistics},
-  pages     = {272--307},
-  url       = {http://www.aclweb.org/anthology/W18-6401}
-}
-```
-
----
-
-### `"wmt_translate_enfr"`
-
-Translate dataset based on the data from statmt.org.
-
-
-* URL: [http://www.statmt.org/wmt18/](http://www.statmt.org/wmt18/)
-* `DatasetBuilder`: [`tfds.translate.wmt_enfr.WmtTranslateEnfr`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/translate/wmt_enfr.py)
-
-`wmt_translate_enfr` is configured with `tfds.translate.wmt_enfr.WMTConfig` and has the following
-configurations predefined (defaults to the first one):
-
-* `"enfr_plain_text_t2t_small"` (`v0.0.2`) (`Size: ?? GiB`): Translation dataset from en to fr, uses encoder plain_text. It uses the following data files (see the code for exact contents): {"dev": ["opennmt_1M_enfr_valid"], "train": ["opennmt_1M_enfr_train"]}.
-
-* `"enfr_subwords8k_t2t_small"` (`v0.0.2`) (`Size: ?? GiB`): Translation dataset from en to fr, uses encoder subwords8k. It uses the following data files (see the code for exact contents): {"dev": ["opennmt_1M_enfr_valid"], "train": ["opennmt_1M_enfr_train"]}.
-
-* `"enfr_plain_text_t2t_large"` (`v0.0.2`) (`Size: ?? GiB`): Translation dataset from en to fr, uses encoder plain_text. It uses the following data files (see the code for exact contents): {"dev": ["wmt17_newstest13"], "train": ["wmt13_commoncrawl_enfr", "wmt13_europarl_enfr", "wmt14_news_commentary_enfr", "wmt13_undoc_enfr"]}.
-
-* `"enfr_subwords8k_t2t_large"` (`v0.0.2`) (`Size: ?? GiB`): Translation dataset from en to fr, uses encoder subwords8k. It uses the following data files (see the code for exact contents): {"dev": ["wmt17_newstest13"], "train": ["wmt13_commoncrawl_enfr", "wmt13_europarl_enfr", "wmt14_news_commentary_enfr", "wmt13_undoc_enfr"]}.
-
-
-#### `"wmt_translate_enfr/enfr_plain_text_t2t_small"`
-
-```python
-Translation({
-    'en': Text(shape=(), dtype=tf.string, encoder=None),
-    'fr': Text(shape=(), dtype=tf.string, encoder=None),
-})
-```
-
-
-
-#### `"wmt_translate_enfr/enfr_subwords8k_t2t_small"`
-
-```python
-Translation({
-    'en': Text(shape=(), dtype=tf.string, encoder=None),
-    'fr': Text(shape=(), dtype=tf.string, encoder=None),
-})
-```
-
-
-
-#### `"wmt_translate_enfr/enfr_plain_text_t2t_large"`
-
-```python
-Translation({
-    'en': Text(shape=(), dtype=tf.string, encoder=None),
-    'fr': Text(shape=(), dtype=tf.string, encoder=None),
-})
-```
-
-
-
-#### `"wmt_translate_enfr/enfr_subwords8k_t2t_large"`
-
-```python
-Translation({
-    'en': Text(shape=(), dtype=tf.string, encoder=None),
-    'fr': Text(shape=(), dtype=tf.string, encoder=None),
-})
-```
-
-
-
-
-#### Statistics
-None computed
-
-#### Urls
- * [http://www.statmt.org/wmt18/](http://www.statmt.org/wmt18/)
-
-#### Supervised keys (for `as_supervised=True`)
-`(u'en', u'fr')`
-
-#### Citation
-```
-@InProceedings{bojar-EtAl:2018:WMT1,
-  author    = {Bojar, Ond{r}ej  and  Federmann, Christian  and  Fishel, Mark
-    and Graham, Yvette  and  Haddow, Barry  and  Huck, Matthias  and
-    Koehn, Philipp  and  Monz, Christof},
-  title     = {Findings of the 2018 Conference on Machine Translation (WMT18)},
-  booktitle = {Proceedings of the Third Conference on Machine Translation,
-    Volume 2: Shared Task Papers},
-  month     = {October},
-  year      = {2018},
-  address   = {Belgium, Brussels},
-  publisher = {Association for Computational Linguistics},
-  pages     = {272--307},
-  url       = {http://www.aclweb.org/anthology/W18-6401}
-}
-```
-
----
-
-
 ## [`video`](#video)
 
 ### `"bair_robot_pushing_small"`
diff --git a/tensorflow_datasets/translate/wmt.py b/tensorflow_datasets/translate/wmt.py
@@ -104,6 +104,7 @@ def __init__(self,
 class WmtTranslate(tfds.core.GeneratorBasedBuilder):
   """WMT translation dataset."""
   _URL = "http://www.statmt.org/wmt18/"
+  IN_DEVELOPMENT = True
 
   @abc.abstractproperty
   def translate_datasets(self):
diff --git a/tensorflow_datasets/translate/wmt_ende.py b/tensorflow_datasets/translate/wmt_ende.py
@@ -61,6 +61,7 @@
 
 class WmtTranslateEnde(wmt.WmtTranslate):
   """WMT English-German translation dataset."""
+  IN_DEVELOPMENT = True
 
   BUILDER_CONFIGS = [
       wmt.WMTConfig(
diff --git a/tensorflow_datasets/translate/wmt_enfr.py b/tensorflow_datasets/translate/wmt_enfr.py
@@ -90,6 +90,7 @@
 
 class WmtTranslateEnfr(wmt.WmtTranslate):
   """English-French WMT translation dataset."""
+  IN_DEVELOPMENT = True
 
   BUILDER_CONFIGS = [
       # EN-FR translations (matching the data used by Tensor2Tensor library).