def partition_from_selected_corpus(self, corpus_id, account_id, training_file, testing_file,
                                   remote_path, storage_id, partition_value, is_percent):
    """Partition the corpus identified by corpus_id via the storage owning remote_path.

    The storage client is resolved from ``remote_path`` and ``storage_id``; the
    request is then delegated to that client's ``partition_from_selected_corpus``
    and its result is returned unchanged.
    """
    LOGGER.info('Partitioning corpus from selected corpus to %s', remote_path)
    # Only the client is needed here; the storage-relative path returned
    # alongside it is not forwarded to the client.
    storage_client, _ = self._get_storage(remote_path, storage_id=storage_id)
    delegate = storage_client.partition_from_selected_corpus
    return delegate(corpus_id, account_id, training_file, testing_file,
                    partition_value, is_percent)
def partition_from_selected_corpus(self, corpus_id, account_id, training_file, testing_file,
                                   partition_value, is_percent):
    """Split an existing corpus into a training file and a testing file.

    Posts a partition request to ``<host_url>/corpus/partition``.

    Args:
        corpus_id: sent as ``id`` in the request.
        account_id: sent as ``accountId``; ``self.account_id`` is sent as
            ``readOnlyAccountId``.
        training_file: filename of the training part.
        testing_file: filename of the testing part.
        partition_value: size of the testing part. When ``is_percent`` is
            true it is a percentage and must lie between 0 and 100;
            otherwise it is sent as-is as the testing ``segments`` value
            (presumably a segment count -- confirm against the corpus
            manager API) and the training part receives ``'remains'``.
        is_percent: whether ``partition_value`` is a percentage.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: if the percentage is out of range, or if the service
            does not answer with HTTP 200.
    """
    if is_percent:
        # Reject values that would otherwise be turned into negative shares.
        if not 0 <= partition_value <= 100:
            raise ValueError("Cannot partition corpus : percentage must be between 0 and 100, "
                             "got %s" % partition_value)
        training_part = {'segments': str(100 - partition_value), 'filename': str(training_file)}
        testing_part = {'segments': str(partition_value), 'filename': str(testing_file)}
        # An empty side (0%) is left out of the request entirely.
        if partition_value == 100:
            partition = [testing_part]
        elif partition_value == 0:
            partition = [training_part]
        else:
            partition = [training_part, testing_part]
    else:
        partition = [
            {'segments': 'remains', 'filename': str(training_file)},
            {'segments': str(partition_value), 'filename': str(testing_file)},
        ]

    data_partition = {
        'partition': partition,
        'readOnlyAccountId': self.account_id,
        'accountId': account_id,
        'id': corpus_id,
    }
    if is_percent:
        data_partition['usePercentage'] = True

    response = requests.post(self.host_url + '/corpus/partition', json=data_partition)
    if response.status_code != 200:
        # The error body is not guaranteed to be a JSON object (it may be
        # empty, HTML, or a JSON array), so parse it defensively and fall
        # back to the generic message.
        try:
            error_body = json.loads(response.content)
        except ValueError:
            error_body = None
        error_message = error_body.get('error') if isinstance(error_body, dict) else None
        if error_message:
            raise ValueError("Cannot partition corpus : %s" % error_message)
        raise ValueError("Cannot partition corpus")
    return response.json()