diff --git a/tensorflow_io/BUILD b/tensorflow_io/BUILD index 6373f11e3..29b1cf042 100644 --- a/tensorflow_io/BUILD +++ b/tensorflow_io/BUILD @@ -15,6 +15,7 @@ cc_binary( "//tensorflow_io/core:bigtable_ops", "//tensorflow_io/core:audio_video_ops", "//tensorflow_io/core:avro_ops", + "//tensorflow_io/core:avro_atds", "//tensorflow_io/core:orc_ops", "//tensorflow_io/core:cpuinfo", "//tensorflow_io/core:file_ops", diff --git a/tensorflow_io/core/BUILD b/tensorflow_io/core/BUILD index e68c817a2..04c8ccac6 100644 --- a/tensorflow_io/core/BUILD +++ b/tensorflow_io/core/BUILD @@ -484,6 +484,61 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "avro_atds", + srcs = [ + "kernels/avro/atds/atds_decoder.cc", + "kernels/avro/atds/errors.cc", + "kernels/avro/atds_dataset_kernels.cc", + ], + hdrs = [ + "kernels/avro/atds/atds_decoder.h", + "kernels/avro/atds/avro_block_reader.h", + "kernels/avro/atds/avro_decoder_template.h", + "kernels/avro/atds/decoder_base.h", + "kernels/avro/atds/decompression_handler.h", + "kernels/avro/atds/dense_feature_decoder.h", + "kernels/avro/atds/errors.h", + "kernels/avro/atds/opaque_contextual_feature_decoder.h", + "kernels/avro/atds/shuffle_handler.h", + "kernels/avro/atds/sparse_feature_decoder.h", + "kernels/avro/atds/sparse_feature_internal_decoder.h", + "kernels/avro/atds/sparse_value_buffer.h", + "kernels/avro/atds/varlen_feature_decoder.h", + "kernels/avro/atds_dataset_kernels.h", + ], + copts = tf_io_copts(), + linkstatic = True, + deps = [ + ":avro_ops", + "@avro", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], + alwayslink = 1, +) + +cc_library( + name = "avro_atds_tests", + srcs = [ + "kernels/avro/atds/atds_decoder_test.cc", + "kernels/avro/atds/avro_block_reader_test.cc", + "kernels/avro/atds/decoder_test_util.cc", + "kernels/avro/atds/decoder_test_util.h", + "kernels/avro/atds/dense_feature_decoder_test.cc", + "kernels/avro/atds/shuffle_handler_test.cc", + 
"kernels/avro/atds/sparse_feature_decoder_test.cc", + "kernels/avro/atds/sparse_value_buffer_test.cc", + "kernels/avro/atds/varlen_feature_decoder_test.cc", + ], + copts = tf_io_copts(), + deps = [ + ":avro_atds", + "//tensorflow_io/core:avro_ops", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "orc_ops", srcs = [ diff --git a/tensorflow_io/core/kernels/avro/atds/atds_decoder.cc b/tensorflow_io/core/kernels/avro/atds/atds_decoder.cc new file mode 100644 index 000000000..997c417a3 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/atds_decoder.cc @@ -0,0 +1,82 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow_io/core/kernels/avro/atds/atds_decoder.h" + +#include "api/Generic.hh" +#include "api/Specific.hh" +#include "tensorflow_io/core/kernels/avro/atds/dense_feature_decoder.h" +#include "tensorflow_io/core/kernels/avro/atds/errors.h" +#include "tensorflow_io/core/kernels/avro/atds/opaque_contextual_feature_decoder.h" +#include "tensorflow_io/core/kernels/avro/atds/sparse_feature_decoder.h" +#include "tensorflow_io/core/kernels/avro/atds/varlen_feature_decoder.h" + +namespace tensorflow { +namespace atds { + +Status ATDSDecoder::Initialize(const avro::ValidSchema& schema) { + auto& root_node = schema.root(); + if (root_node->type() != avro::AVRO_RECORD) { + return ATDSNotRecordError(avro::toString(root_node->type()), + schema.toJson()); + } + + size_t num_of_columns = root_node->leaves(); + feature_names_.resize(num_of_columns, ""); + decoder_types_.resize(num_of_columns, FeatureType::opaque_contextual); + decoders_.resize(num_of_columns); + + for (size_t i = 0; i < dense_features_.size(); i++) { + TF_RETURN_IF_ERROR( + InitializeFeatureDecoder(schema, root_node, dense_features_[i])); + } + + for (size_t i = 0; i < sparse_features_.size(); i++) { + TF_RETURN_IF_ERROR( + InitializeFeatureDecoder(schema, root_node, sparse_features_[i])); + } + + for (size_t i = 0; i < varlen_features_.size(); i++) { + TF_RETURN_IF_ERROR( + InitializeFeatureDecoder(schema, root_node, varlen_features_[i])); + } + + size_t opaque_contextual_index = 0; + for (size_t i = 0; i < num_of_columns; i++) { + if (decoder_types_[i] == FeatureType::opaque_contextual) { + decoders_[i] = std::unique_ptr( + new opaque_contextual::FeatureDecoder(opaque_contextual_index++)); + + auto& opaque_contextual_node = root_node->leafAt(i); + skipped_data_.emplace_back(opaque_contextual_node); + if (opaque_contextual_node->hasName()) { + feature_names_[i] = root_node->leafAt(i)->name(); + LOG(WARNING) << "Column 
'" << feature_names_[i] << "' from input data" + << " is not used. Cost of parsing an unused column is " + "prohibitive!! " + << "Consider dropping it to improve I/O performance."; + } + } + } + + // Decoder requires unvaried schema in all input files. + // Copy the schema to validate other input files. + schema_ = schema; + + return OkStatus(); +} + +} // namespace atds +} // namespace tensorflow diff --git a/tensorflow_io/core/kernels/avro/atds/atds_decoder.h b/tensorflow_io/core/kernels/avro/atds/atds_decoder.h new file mode 100644 index 000000000..8c7c907e6 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/atds_decoder.h @@ -0,0 +1,150 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECODER_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECODER_H_ + +#include "api/Decoder.hh" +#include "api/GenericDatum.hh" +#include "api/ValidSchema.hh" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow_io/core/kernels/avro/atds/decoder_base.h" +#include "tensorflow_io/core/kernels/avro/atds/dense_feature_decoder.h" +#include "tensorflow_io/core/kernels/avro/atds/errors.h" +#include "tensorflow_io/core/kernels/avro/atds/sparse_feature_decoder.h" +#include "tensorflow_io/core/kernels/avro/atds/varlen_feature_decoder.h" + +namespace tensorflow { +namespace atds { + +class NullableFeatureDecoder : public DecoderBase { + public: + explicit NullableFeatureDecoder(std::unique_ptr& decoder, + size_t non_null_index) + : decoder_(std::move(decoder)), non_null_index_(non_null_index) {} + + Status operator()(avro::DecoderPtr& decoder, + std::vector& dense_tensors, + sparse::ValueBuffer& buffer, + std::vector& skipped_data, + size_t offset) { + auto index = decoder->decodeUnionIndex(); + if (index != non_null_index_) { + return NullValueError(); + } + return decoder_->operator()(decoder, dense_tensors, buffer, skipped_data, + offset); + } + + private: + std::unique_ptr decoder_; + const size_t non_null_index_; +}; + +class ATDSDecoder { + public: + explicit ATDSDecoder(const std::vector& dense_features, + const std::vector& sparse_features, + const std::vector& varlen_features) + : dense_features_(dense_features), + sparse_features_(sparse_features), + varlen_features_(varlen_features) {} + + Status Initialize(const avro::ValidSchema&); + + Status DecodeATDSDatum(avro::DecoderPtr& decoder, + std::vector& dense_tensors, + sparse::ValueBuffer& buffer, + std::vector& skipped_data, 
+ size_t offset) { + // LOG(INFO) << "Decode atds from offset: " << offset; + for (size_t i = 0; i < decoders_.size(); i++) { + Status status = decoders_[i]->operator()(decoder, dense_tensors, buffer, + skipped_data, offset); + if (TF_PREDICT_FALSE(!status.ok())) { + return FeatureDecodeError(feature_names_[i], status.error_message()); + } + } + // LOG(INFO) << "Decode atds from offset Done: " << offset; + return OkStatus(); + } + + const std::vector& GetSkippedData() { + return skipped_data_; + } + + const avro::ValidSchema& GetSchema() { return schema_; } + + private: + template + Status InitializeFeatureDecoder(const avro::ValidSchema& schema, + const avro::NodePtr& root_node, + const Metadata& metadata) { + size_t pos; + if (!root_node->nameIndex(metadata.name, pos)) { + return FeatureNotFoundError(metadata.name, schema.toJson()); + } + decoder_types_[pos] = metadata.type; + feature_names_[pos] = metadata.name; + + auto& feature_node = root_node->leafAt(pos); + if (feature_node->type() == avro::AVRO_UNION) { + size_t non_null_index = 0; + size_t num_union_types = feature_node->leaves(); + + if (num_union_types == 2 && + feature_node->leafAt(0)->type() == avro::AVRO_NULL) { + non_null_index = 1; + } + + if (num_union_types == 1 || num_union_types == 2) { + auto& non_null_feature_node = feature_node->leafAt(non_null_index); + TF_RETURN_IF_ERROR(ValidateSchema(non_null_feature_node, metadata)); + std::unique_ptr decoder_base = + CreateFeatureDecoder(non_null_feature_node, metadata); + decoders_[pos] = std::unique_ptr( + new NullableFeatureDecoder(decoder_base, non_null_index)); + } else { + std::ostringstream oss; + feature_node->printJson(oss, 0); + return InvalidUnionTypeError(metadata.name, oss.str()); + } + } else { + TF_RETURN_IF_ERROR(ValidateSchema(feature_node, metadata)); + decoders_[pos] = CreateFeatureDecoder(feature_node, metadata); + } + + return OkStatus(); + } + + const std::vector& dense_features_; + const std::vector& sparse_features_; + const 
std::vector& varlen_features_; + + std::vector feature_names_; + std::vector> decoders_; + std::vector decoder_types_; + + std::vector skipped_data_; + avro::ValidSchema schema_; +}; + +} // namespace atds +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECODER_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/atds_decoder_test.cc b/tensorflow_io/core/kernels/avro/atds/atds_decoder_test.cc new file mode 100644 index 000000000..61e735974 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/atds_decoder_test.cc @@ -0,0 +1,147 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow_io/core/kernels/avro/atds/atds_decoder.h" + +#include "api/Decoder.hh" +#include "api/GenericDatum.hh" +#include "api/Stream.hh" +#include "api/ValidSchema.hh" +#include "tensorflow/core/platform/test.h" +#include "tensorflow_io/core/kernels/avro/atds/decoder_test_util.h" +#include "tensorflow_io/core/kernels/avro/atds/dense_feature_decoder.h" + +namespace tensorflow { +namespace atds { + +TEST(ATDSDecoder, TestMixedFeatures) { + std::vector feature_names = { + "dense_float_1d", "dense_long_2d", "unused_dense", + "sparse_int_1d", "unsed_sparse", "sparse_string_2d", + "unused_varlen", "varlen_bool_1d", "varlen_string_2d"}; + std::vector feature_pos = {0, 1, 0, 1, 2, 3}; + std::vector> feature_shapes = { + {3}, {2, 2}, {101}, {6, 10}, {-1}, {-1, -1}}; + std::vector tensor_shapes; + for (auto shape : feature_shapes) { + tensor_shapes.emplace_back(shape); + } + + ATDSSchemaBuilder schema_builder = ATDSSchemaBuilder(); + schema_builder.AddDenseFeature(feature_names[0], DT_FLOAT, 1) + .AddDenseFeature(feature_names[1], DT_INT64, 2) + .AddDenseFeature(feature_names[2], DT_FLOAT, 2) // unused + .AddSparseFeature(feature_names[3], DT_INT32, 1) + .AddSparseFeature(feature_names[4], DT_DOUBLE, 1) // unused + .AddSparseFeature(feature_names[5], DT_STRING, 2) + .AddDenseFeature(feature_names[6], DT_BOOL, 0) // unused + .AddDenseFeature(feature_names[7], DT_BOOL, 1) + .AddDenseFeature(feature_names[8], DT_STRING, 2); + + string schema = schema_builder.Build(); + avro::ValidSchema writer_schema = schema_builder.BuildVaildSchema(); + + avro::GenericDatum atds_datum(writer_schema); + std::vector dense_float_1d = {1.0, 2.0, 3.0}; + std::vector> dense_long_2d = {{1, 3}, {2, 4}}; + std::vector> unused_dense = {{1.0, 2.0}}; + + std::vector> sparse_int_1d_indices = {{100}}; + std::vector sparse_int_1d_values = {100}; + std::vector> sparse_string_2d_indices = {{5, 5}, {4, 8}}; + 
std::vector sparse_string_2d_values = {"TensorFlow", "Linkedin"}; + std::vector> unused_sparse_indices = {{0, 1}}; + std::vector unused_sparse_values = {1.0, -1.0}; + + std::vector varlen_bool_1d = {true, false, true}; + std::vector> varlen_string_2d = {{"ABC"}, {}, {"DEF"}}; + std::vector expected_varlen_string_2d_values = {"ABC", "DEF"}; + bool unused_varlen = true; + + AddDenseValue(atds_datum, feature_names[0], dense_float_1d); + AddDenseValue(atds_datum, feature_names[1], dense_long_2d); + AddDenseValue(atds_datum, feature_names[2], unused_dense); + AddSparseValue(atds_datum, feature_names[3], sparse_int_1d_indices, + sparse_int_1d_values); + AddSparseValue(atds_datum, feature_names[4], unused_sparse_indices, + unused_sparse_values); + AddSparseValue(atds_datum, feature_names[5], sparse_string_2d_indices, + sparse_string_2d_values); + AddDenseValue(atds_datum, feature_names[6], unused_varlen); + AddDenseValue(atds_datum, feature_names[7], varlen_bool_1d); + AddDenseValue(atds_datum, feature_names[8], varlen_string_2d); + + avro::OutputStreamPtr out_stream = EncodeAvroGenericDatum(atds_datum); + avro::InputStreamPtr in_stream = avro::memoryInputStream(*out_stream); + avro::DecoderPtr decoder = avro::binaryDecoder(); + decoder->init(*in_stream); + + std::vector dense_features; + dense_features.emplace_back(FeatureType::dense, feature_names[0], DT_FLOAT, + tensor_shapes[0], feature_pos[0]); + dense_features.emplace_back(FeatureType::dense, feature_names[1], DT_INT64, + tensor_shapes[1], feature_pos[1]); + + size_t values_index = 0; + std::vector sparse_features; + sparse_features.emplace_back(FeatureType::sparse, feature_names[3], DT_INT32, + tensor_shapes[2], feature_pos[2], values_index); + sparse_features.emplace_back(FeatureType::sparse, feature_names[5], DT_STRING, + tensor_shapes[3], feature_pos[3], values_index); + + std::vector varlen_features; + size_t string_value_index = 1; // index 0 is used by sparse_string_2d. 
+ varlen_features.emplace_back(FeatureType::varlen, feature_names[7], DT_BOOL, + tensor_shapes[4], feature_pos[4], values_index); + varlen_features.emplace_back(FeatureType::varlen, feature_names[8], DT_STRING, + tensor_shapes[5], feature_pos[5], + string_value_index); + + ATDSDecoder atds_decoder = + ATDSDecoder(dense_features, sparse_features, varlen_features); + Status init_status = atds_decoder.Initialize(writer_schema); + ASSERT_TRUE(init_status.ok()); + + std::vector dense_tensors; + dense_tensors.emplace_back(DT_FLOAT, TensorShape(feature_shapes[0])); + dense_tensors.emplace_back(DT_INT64, TensorShape(feature_shapes[1])); + + sparse::ValueBuffer buffer; + buffer.indices.resize(4); + buffer.num_of_elements.resize(4); + buffer.string_values.resize(2); + buffer.int_values.resize(1); + buffer.bool_values.resize(1); + + std::vector skipped_data = atds_decoder.GetSkippedData(); + long offset = 0L; + Status decode_status = + atds_decoder.DecodeATDSDatum(decoder, dense_tensors, buffer, skipped_data, + static_cast(offset)); + ASSERT_TRUE(decode_status.ok()); + AssertTensorValues(dense_tensors[0], dense_float_1d); + AssertTensorValues(dense_tensors[1], dense_long_2d); + ValidateBuffer(buffer, sparse_features[0], {offset, 100}, + sparse_int_1d_values, {1}); + ValidateBuffer(buffer, sparse_features[1], {offset, 5, 4, offset, 5, 8}, + sparse_string_2d_values, {2}); + ValidateBuffer(buffer, varlen_features[0], {offset, 0, offset, 1, offset, 2}, + varlen_bool_1d, {3}); + ValidateBuffer(buffer, varlen_features[1], {offset, 0, 0, offset, 2, 0}, + expected_varlen_string_2d_values, {2}); +} + +} // namespace atds +} // namespace tensorflow diff --git a/tensorflow_io/core/kernels/avro/atds/avro_block_reader.h b/tensorflow_io/core/kernels/avro/atds/avro_block_reader.h new file mode 100644 index 000000000..10dad8605 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/avro_block_reader.h @@ -0,0 +1,225 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_AVRO_BLOCK_READER_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_AVRO_BLOCK_READER_H_ + +#include +#include + +#include "api/Compiler.hh" +#include "api/DataFile.hh" +#include "api/Decoder.hh" +#include "api/Specific.hh" +#include "api/Stream.hh" +#include "api/ValidSchema.hh" +#include "tensorflow/core/lib/io/random_inputstream.h" + +namespace tensorflow { +namespace data { + +struct AvroBlock { + int64_t object_count; + int64_t num_to_decode; + int64_t num_decoded; + int64_t byte_count; + int64_t counts; + tstring content; + avro::Codec codec; + size_t read_offset; +}; + +class FileBufferInputStream : public avro::InputStream { + public: + FileBufferInputStream(tensorflow::RandomAccessFile* file, int64 buffer_size) + : reader_(nullptr), + limit_(0), + pos_(0), + count_(0), + skip_(0), + buffer_size_(buffer_size) { + reader_ = absl::make_unique(file); + } + + bool next(const uint8_t** data, size_t* len) override { + while (pos_ == limit_) { + if (skip_ > 0) { + reader_->SkipNBytes(static_cast(skip_)); + skip_ = 0; + } + + buf_.clear(); + Status status = reader_->ReadNBytes(buffer_size_, &buf_); + pos_ = 0; + limit_ = buf_.size(); + if (limit_ == 0 && errors::IsOutOfRange(status)) { + return false; + } + } + + if (*len == 0 || pos_ + *len > limit_) { + *len = limit_ - pos_; + } + + *data = 
reinterpret_cast(buf_.data()) + pos_; + pos_ += *len; + count_ += *len; + + return *len != 0; + } + + void backup(size_t len) override { + pos_ -= len; + count_ -= len; + } + + void skip(size_t len) override { + if (pos_ + len > limit_) { + skip_ = pos_ + len - limit_; + pos_ = limit_; + } else { + pos_ += len; + } + + count_ += len; + } + + size_t byteCount() const override { return count_; } + + private: + std::unique_ptr reader_; + size_t limit_, pos_, count_, skip_; + const int64 buffer_size_; + tstring buf_; +}; + +constexpr const char* const AVRO_SCHEMA_KEY = "avro.schema"; +constexpr const char* const AVRO_CODEC_KEY = "avro.codec"; +constexpr const char* const AVRO_NULL_CODEC = "null"; +constexpr const char* const AVRO_DEFLATE_CODEC = "deflate"; +constexpr const char* const AVRO_SNAPPY_CODEC = "snappy"; + +using Magic = std::array; +static const Magic magic = {{'O', 'b', 'j', '\x01'}}; + +using AvroMetadata = std::map>; + +class AvroBlockReader { + public: + AvroBlockReader(tensorflow::RandomAccessFile* file, int64 buffer_size) + : stream_(nullptr), decoder_(nullptr) { + stream_ = std::make_unique(file, buffer_size); + decoder_ = avro::binaryDecoder(); + ReadHeader(); + } + + const avro::ValidSchema& GetSchema() { return data_schema_; } + + Status ReadBlock(AvroBlock& block) { + decoder_->init(*stream_); + const uint8_t* p = 0; + size_t n = 0; + if (!stream_->next(&p, &n)) { + return errors::OutOfRange("eof"); + } + stream_->backup(n); + + avro::decode(*decoder_, block.object_count); + // LOG(INFO) << "block object counts = " << block.object_count; + avro::decode(*decoder_, block.byte_count); + // LOG(INFO) << "block bytes counts = " << block.byte_count; + block.content.reserve(block.byte_count); + + decoder_->init(*stream_); + int64_t remaining_bytes = block.byte_count; + while (remaining_bytes > 0) { + const uint8_t* data; + size_t len = remaining_bytes; + if (!stream_->next(&data, &len)) { + return errors::OutOfRange("eof"); + } + 
block.content.append(reinterpret_cast(data), len); + remaining_bytes -= len; + } + // LOG(INFO) << "block content = " << block.content; + block.codec = codec_; + block.read_offset = 0; + block.num_decoded = 0; + block.num_to_decode = 0; + decoder_->init(*stream_); + avro::DataFileSync sync_marker; + avro::decode(*decoder_, sync_marker); + if (sync_marker != sync_marker_) { + return errors::DataLoss("Avro sync marker mismatch."); + } + + return OkStatus(); + } + + private: + void ReadHeader() { + decoder_->init(*stream_); + Magic m; + avro::decode(*decoder_, m); + if (magic != m) { + throw avro::Exception("Invalid data file. Magic does not match."); + } + avro::decode(*decoder_, metadata_); + AvroMetadata::const_iterator it = metadata_.find(AVRO_SCHEMA_KEY); + if (it == metadata_.end()) { + throw avro::Exception("No schema in metadata"); + } + + string schema = std::string( + reinterpret_cast(it->second.data()), it->second.size()); + // LOG(INFO) << schema; + std::istringstream iss(schema); + avro::compileJsonSchema(iss, data_schema_); + + it = metadata_.find(AVRO_CODEC_KEY); + if (it != metadata_.end()) { + size_t length = it->second.size(); + const char* codec = reinterpret_cast(it->second.data()); + // LOG(INFO) << "Codec = " << std::string(codec, length); + if (strncmp(codec, AVRO_DEFLATE_CODEC, length) == 0) { + codec_ = avro::DEFLATE_CODEC; + } else if (strncmp(codec, AVRO_SNAPPY_CODEC, length) == 0) { + codec_ = avro::SNAPPY_CODEC; + } else if (strncmp(codec, AVRO_NULL_CODEC, length) == 0) { + codec_ = avro::NULL_CODEC; + } else { + throw avro::Exception("Unknown codec in data file: " + + std::string(codec, it->second.size())); + } + } else { + codec_ = avro::NULL_CODEC; + } + + avro::decode(*decoder_, sync_marker_); + } + + AvroMetadata metadata_; + avro::DataFileSync sync_marker_; + avro::Codec codec_; + + std::unique_ptr stream_; + avro::DecoderPtr decoder_; + avro::ValidSchema data_schema_; +}; + +} // namespace data +} // namespace tensorflow + +#endif 
// TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_AVRO_BLOCK_READER_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/avro_block_reader_test.cc b/tensorflow_io/core/kernels/avro/atds/avro_block_reader_test.cc new file mode 100644 index 000000000..e6d0b5b85 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/avro_block_reader_test.cc @@ -0,0 +1,625 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow_io/core/kernels/avro/atds/avro_block_reader.h" + +#include "absl/memory/memory.h" +#include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow_io/core/kernels/avro/atds/decoder_test_util.h" +//#include "tensorflow/tsl/platform/default/posix_file_system.h" + +#include +#include + +#include "api/DataFile.hh" +#include "api/Generic.hh" +#include "api/GenericDatum.hh" +#include "api/Stream.hh" + +namespace tensorflow { +namespace data { + +class MockRandomAccessFile : public RandomAccessFile { + public: + explicit MockRandomAccessFile(char* content, size_t len) + : content_(content), len_(len) {} + + Status Read(uint64 offset, size_t n, StringPiece* result, + char* scratch) const override { + size_t bytes_to_copy = std::min(n, len_ - static_cast(offset)); + memcpy(scratch, content_ + offset, bytes_to_copy); + *result = StringPiece(scratch, bytes_to_copy); + if (bytes_to_copy == 
n) { + return OkStatus(); + } + return Status(tensorflow::error::Code::OUT_OF_RANGE, "eof"); + } + + private: + const char* content_; + size_t len_; +}; + +TEST(FileBufferInputStreamTest, SINGLE_BUFFER) { + char content[8]; + for (size_t i = 0; i < 8; i++) { + content[i] = '0' + i; + } + std::unique_ptr raf = + absl::make_unique(content, 8); + int64 buffer_size = 8; + FileBufferInputStream stream(raf.get(), buffer_size); + const uint8_t* data; + size_t len = 4; + ASSERT_TRUE(stream.next(&data, &len)); + ASSERT_EQ(4, len); + ASSERT_EQ(4, stream.byteCount()); + tensorflow::atds::AssertValueEqual("0123", (char*)data, len); + + stream.skip(1); + len = 3; + stream.next(&data, &len); + ASSERT_EQ(3, len); + ASSERT_EQ(8, stream.byteCount()); + tensorflow::atds::AssertValueEqual("567", (char*)data, len); + + stream.backup(5); + len = 3; + stream.next(&data, &len); + ASSERT_EQ(3, len); + ASSERT_EQ(6, stream.byteCount()); + tensorflow::atds::AssertValueEqual("345", (char*)data, len); +} + +TEST(FileBufferInputStreamTest, READ_PAST_BUFFER) { + char content[16]; + for (size_t i = 0; i < 16; i++) { + content[i] = 'a' + i; + } + std::unique_ptr raf = + absl::make_unique(content, 16); + int64 buffer_size = 8; + FileBufferInputStream stream(raf.get(), buffer_size); + const uint8_t* data; + size_t len = 3; + ASSERT_TRUE(stream.next(&data, &len)); + ASSERT_EQ(3, len); + ASSERT_EQ(3, stream.byteCount()); + tensorflow::atds::AssertValueEqual("abc", (char*)data, len); + + len = 7; + stream.next(&data, &len); + ASSERT_EQ(5, len); + ASSERT_EQ(8, stream.byteCount()); + tensorflow::atds::AssertValueEqual("defgh", (char*)data, len); + + len = 4; + stream.next(&data, &len); + ASSERT_EQ(4, len); + ASSERT_EQ(12, stream.byteCount()); + tensorflow::atds::AssertValueEqual("ijkl", (char*)data, len); +} + +TEST(FileBufferInputStreamTest, SKIP_PAST_BUFFER) { + char content[16]; + for (size_t i = 0; i < 16; i++) { + content[i] = 'a' + i; + } + std::unique_ptr raf = + absl::make_unique(content, 16); + 
int64 buffer_size = 8; + FileBufferInputStream stream(raf.get(), buffer_size); + const uint8_t* data; + size_t len = 3; + ASSERT_TRUE(stream.next(&data, &len)); + ASSERT_EQ(3, len); + ASSERT_EQ(3, stream.byteCount()); + tensorflow::atds::AssertValueEqual("abc", (char*)data, len); + + stream.skip(7); + ASSERT_EQ(10, stream.byteCount()); + + len = 4; + stream.next(&data, &len); + ASSERT_EQ(4, len); + ASSERT_EQ(14, stream.byteCount()); + tensorflow::atds::AssertValueEqual("klmn", (char*)data, len); +} + +static constexpr size_t OS_BUFFER_SIZE = 1024; + +class StringOutputStream : public avro::OutputStream { + public: + StringOutputStream(string* buf) : buf_(buf), pos_(0) {} + + bool next(uint8_t** data, size_t* len) { + size_t capacity = buf_->capacity(); + if (pos_ == capacity) { + buf_->resize(capacity + OS_BUFFER_SIZE); + } + *data = + reinterpret_cast(const_cast(&(buf_->c_str())[pos_])); + size_t new_capacity = buf_->capacity(); + *len = new_capacity - pos_; + pos_ = new_capacity; + return true; + } + + void backup(size_t len) { pos_ -= len; } + + uint64_t byteCount() const { return pos_; } + + void flush() {} + + private: + string* buf_; + size_t pos_; +}; + +static constexpr int64 BUFFER_SIZE = 1024; + +Status AvroBlockReaderTest(char* content, size_t byte_count) { + std::unique_ptr raf = + absl::make_unique(content, byte_count); + std::unique_ptr reader = + absl::make_unique(raf.get(), BUFFER_SIZE); + AvroBlock blk; + return reader->ReadBlock(blk); +} + +template +void AvroBlockReaderTest(char* expected_content, int64_t object_count, + size_t expected_byte_count, avro::ValidSchema& schema, + const std::vector& data) { + string buf; + std::unique_ptr os = + absl::make_unique(&buf); + avro::DataFileWriter writer(std::move(os), schema); + for (T datum : data) { + writer.write(datum); + } + writer.flush(); + writer.close(); + + std::unique_ptr raf = + absl::make_unique(const_cast(buf.c_str()), + buf.capacity()); + std::unique_ptr reader = + 
absl::make_unique(raf.get(), BUFFER_SIZE); + tensorflow::atds::AssertValueEqual(schema, reader->GetSchema()); + AvroBlock blk; + Status status = reader->ReadBlock(blk); + ASSERT_TRUE(status.ok()); + tensorflow::atds::AssertValueEqual(avro::NULL_CODEC, blk.codec); + tensorflow::atds::AssertValueEqual(object_count, blk.object_count); + tensorflow::atds::AssertValueEqual(expected_byte_count, blk.byte_count); + tensorflow::atds::AssertValueEqual(expected_content, blk.content.c_str(), + blk.byte_count); +} + +/* + * These bytes assume the Avro file format specified here: + * https://avro.apache.org/docs/1.9.1/spec.html#Object+Container+Files Bytes + * were manually generated via: + * 1. Writing schema to a file (schema.avsc): + * { + * "type" : "record", + * "name" : "row", + * "fields" : [ + * { + * "name": "dense_1d", + * "type": { + * "type": "array", + * "items": "int" + * } + * }, + * { + * "name": "dense_2d", + * "type": { + * "type": "array", + * "items": { + * "type": "array", + * "items": "int" + * } + * } + * } + * ] + * } + * 2. Writing test data to a file (test.json): + * { + * "dense_1d": [1, 2, 3], + * "dense_2d": [[4, 5], [6, 7]] + * } + * 3. Converting json to avro: + * dali avro fromjson test.json --schema-file schema.avsc > test.avro + * + * If avro file format changes, this byte array will need to be regenerated, and + * test cases modified to change different byte locations in the array. 
+ */ +static constexpr size_t BYTEARRAY_SIZE = 268; +static constexpr char WELLFORMED_CONTENT[] = { + 0x4f, + 0x62, + 0x6a, + 0x01, + 0x04, + 0x16, + 0x61, + 0x76, + 0x72, + 0x6f, + 0x2e, + 0x73, + 0x63, + 0x68, + 0x65, + 0x6d, // Obj...avro.schem + 0x61, + static_cast(0xec), + 0x02, + 0x7b, + 0x22, + 0x74, + 0x79, + 0x70, + 0x65, + 0x22, + 0x3a, + 0x22, + 0x72, + 0x65, + 0x63, + 0x6f, // a..{"type":"reco + 0x72, + 0x64, + 0x22, + 0x2c, + 0x22, + 0x6e, + 0x61, + 0x6d, + 0x65, + 0x22, + 0x3a, + 0x22, + 0x72, + 0x6f, + 0x77, + 0x22, // rd","name":"row" + 0x2c, + 0x22, + 0x66, + 0x69, + 0x65, + 0x6c, + 0x64, + 0x73, + 0x22, + 0x3a, + 0x5b, + 0x7b, + 0x22, + 0x6e, + 0x61, + 0x6d, // ,"fields":[{"nam + 0x65, + 0x22, + 0x3a, + 0x22, + 0x64, + 0x65, + 0x6e, + 0x73, + 0x65, + 0x5f, + 0x31, + 0x64, + 0x22, + 0x2c, + 0x22, + 0x74, // e":"dense_1d","t + 0x79, + 0x70, + 0x65, + 0x22, + 0x3a, + 0x7b, + 0x22, + 0x74, + 0x79, + 0x70, + 0x65, + 0x22, + 0x3a, + 0x22, + 0x61, + 0x72, // ype":{"type":"ar + 0x72, + 0x61, + 0x79, + 0x22, + 0x2c, + 0x22, + 0x69, + 0x74, + 0x65, + 0x6d, + 0x73, + 0x22, + 0x3a, + 0x22, + 0x69, + 0x6e, // ray","items":"in + 0x74, + 0x22, + 0x7d, + 0x7d, + 0x2c, + 0x7b, + 0x22, + 0x6e, + 0x61, + 0x6d, + 0x65, + 0x22, + 0x3a, + 0x22, + 0x64, + 0x65, // t"}},{"name":"de + 0x6e, + 0x73, + 0x65, + 0x5f, + 0x32, + 0x64, + 0x22, + 0x2c, + 0x22, + 0x74, + 0x79, + 0x70, + 0x65, + 0x22, + 0x3a, + 0x7b, // nse_2d","type":{ + 0x22, + 0x74, + 0x79, + 0x70, + 0x65, + 0x22, + 0x3a, + 0x22, + 0x61, + 0x72, + 0x72, + 0x61, + 0x79, + 0x22, + 0x2c, + 0x22, // "type":"array"," + 0x69, + 0x74, + 0x65, + 0x6d, + 0x73, + 0x22, + 0x3a, + 0x7b, + 0x22, + 0x74, + 0x79, + 0x70, + 0x65, + 0x22, + 0x3a, + 0x22, // items":{"type":" + 0x61, + 0x72, + 0x72, + 0x61, + 0x79, + 0x22, + 0x2c, + 0x22, + 0x69, + 0x74, + 0x65, + 0x6d, + 0x73, + 0x22, + 0x3a, + 0x22, // array","items":" + 0x69, + 0x6e, + 0x74, + 0x22, + 0x7d, + 0x7d, + 0x7d, + 0x5d, + 0x7d, + 0x14, + 0x61, + 0x76, + 0x72, + 
0x6f, + 0x2e, + 0x63, // int"}}}]}.avro.c + 0x6f, + 0x64, + 0x65, + 0x63, + 0x08, + 0x6e, + 0x75, + 0x6c, + 0x6c, + 0x00, + static_cast(0xe1), + 0x26, + 0x18, + 0x0e, + static_cast(0xc9), + static_cast(0xbe), // odec.null..&.... + 0x5a, + static_cast(0x8c), + 0x5f, + static_cast(0xe0), + static_cast(0xcd), + 0x5c, + 0x62, + static_cast(0xc2), + 0x3f, + 0x05, + 0x02, + 0x1e, + 0x06, + 0x02, + 0x04, + 0x06, // Z._..\b.?....... + 0x00, + 0x04, + 0x04, + 0x08, + 0x0a, + 0x00, + 0x04, + 0x0c, + 0x0e, + 0x00, + 0x00, + static_cast(0xe1), + 0x26, + 0x18, + 0x0e, + static_cast(0xc9), // ............&... + static_cast(0xbe), + 0x5a, + static_cast(0x8c), + 0x5f, + static_cast(0xe0), + static_cast(0xcd), + 0x5c, + 0x62, + static_cast(0xc2), + 0x3f, + 0x05, + 0x0a // .Z._..\b.?.. +}; + +TEST(AvroBlockReaderTest, MALFORMED_MAGIC) { + char malformed_magic[BYTEARRAY_SIZE]; + memcpy(malformed_magic, WELLFORMED_CONTENT, BYTEARRAY_SIZE); + malformed_magic[2] = 0x6b; // Fill third byte with random character + avro::Exception expected_exception("No exception thrown"); + try { + AvroBlockReaderTest(malformed_magic, BYTEARRAY_SIZE); + } catch (avro::Exception e) { + expected_exception = e; + } + ASSERT_STREQ("Invalid data file. 
Magic does not match.", + expected_exception.what()); +} + +TEST(AvroBlockReaderTest, MISSING_SCHEMA) { + char missing_schema[BYTEARRAY_SIZE]; + memcpy(missing_schema, WELLFORMED_CONTENT, BYTEARRAY_SIZE); + missing_schema[6] = 0x62; // Replace "avro.schema" with "bvro.schema" + avro::Exception expected_exception("No exception thrown"); + try { + AvroBlockReaderTest(missing_schema, BYTEARRAY_SIZE); + } catch (avro::Exception e) { + expected_exception = e; + } + ASSERT_STREQ("No schema in metadata", expected_exception.what()); +} + +TEST(AvroBlockReaderTest, UNSUPPORTED_CODEC) { + char unsupported_codec[BYTEARRAY_SIZE]; + memcpy(unsupported_codec, WELLFORMED_CONTENT, BYTEARRAY_SIZE); + unsupported_codec[213] = 0x6f; // Change codec from "null" to "oull" + avro::Exception expected_exception("No exception thrown"); + try { + AvroBlockReaderTest(unsupported_codec, BYTEARRAY_SIZE); + } catch (avro::Exception e) { + expected_exception = e; + } + ASSERT_STREQ("Unknown codec in data file: oull", expected_exception.what()); +} + +TEST(AvroBlockReaderTest, SYNC_MARKER_MISMATCH) { + char sync_marker_mismatch[BYTEARRAY_SIZE]; + memcpy(sync_marker_mismatch, WELLFORMED_CONTENT, BYTEARRAY_SIZE); + sync_marker_mismatch[218] = + 0xe2; // Change second byte of sync marker from 0xe1 to 0xe2 + Status status = AvroBlockReaderTest(sync_marker_mismatch, BYTEARRAY_SIZE); + ASSERT_EQ(error::Code::DATA_LOSS, status.code()); + ASSERT_STREQ("Avro sync marker mismatch.", status.error_message().c_str()); +} + +TEST(AvroBlockReaderTest, BYTE_COUNT_EOF) { + char byte_count_eof[BYTEARRAY_SIZE]; + memcpy(byte_count_eof, WELLFORMED_CONTENT, BYTEARRAY_SIZE); + byte_count_eof[235] = 0x6e; // Change byte count from 0x1e (15) to 0x6e (55) + Status status = AvroBlockReaderTest(byte_count_eof, BYTEARRAY_SIZE); + ASSERT_EQ(error::Code::OUT_OF_RANGE, status.code()); + ASSERT_STREQ("eof", status.error_message().c_str()); +} + +TEST(AvroBlockReaderTest, DENSE_2D) { + string feature_name = "dense_2d"; + 
tensorflow::atds::ATDSSchemaBuilder schema_builder = + tensorflow::atds::ATDSSchemaBuilder(); + schema_builder.AddDenseFeature(feature_name, DT_INT32, 2); + avro::ValidSchema schema = schema_builder.BuildVaildSchema(); + avro::GenericDatum datum(schema); + tensorflow::atds::AddDenseValue(datum, feature_name, {{1, 2}, {3, 4}}); + avro::OutputStreamPtr out_stream = + tensorflow::atds::EncodeAvroGenericDatum(datum); + avro::InputStreamPtr in_stream = avro::memoryInputStream(*out_stream); + const uint8_t* expected_content; + size_t expected_len; + in_stream->next(&expected_content, &expected_len); + AvroBlockReaderTest((char*)expected_content, 1, + expected_len, schema, {datum}); +} + +TEST(AvroBlockReaderTest, SPARSE_2D) { + string feature_name = "sparse_2d"; + tensorflow::atds::ATDSSchemaBuilder schema_builder = + tensorflow::atds::ATDSSchemaBuilder(); + schema_builder.AddSparseFeature(feature_name, DT_INT64, 2); + avro::ValidSchema schema = schema_builder.BuildVaildSchema(); + avro::GenericDatum datum1(schema); + avro::GenericDatum datum2(schema); + tensorflow::atds::AddSparseValue(datum1, feature_name, + {{1, 2}, {3, 4}}, {5, 6}); + tensorflow::atds::AddSparseValue(datum2, feature_name, + {{7, 8}, {9, 10}}, {11, 12}); + std::vector records = {datum1, datum2}; + avro::OutputStreamPtr out_stream = + tensorflow::atds::EncodeAvroGenericData(records); + avro::InputStreamPtr in_stream = avro::memoryInputStream(*out_stream); + const uint8_t* expected_content; + size_t expected_len; + in_stream->next(&expected_content, &expected_len); + AvroBlockReaderTest( + (char*)expected_content, 2, expected_len, schema, {datum1, datum2}); +} + +} // namespace data +} // namespace tensorflow diff --git a/tensorflow_io/core/kernels/avro/atds/avro_decoder_template.h b/tensorflow_io/core/kernels/avro/atds/avro_decoder_template.h new file mode 100644 index 000000000..3588cc28a --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/avro_decoder_template.h @@ -0,0 +1,61 @@ +/* Copyright 
2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_AVRO_DECODER_TEMPLATE_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_AVRO_DECODER_TEMPLATE_H_ + +#include "api/Decoder.hh" + +namespace avro { +namespace decoder_t { + +template < + typename T, + typename = typename std::enable_if< + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value, + T>::type> +inline T Decode(avro::DecoderPtr& decoder); + +template <> +inline int Decode(avro::DecoderPtr& decoder) { + return decoder->decodeInt(); +} + +template <> +inline long Decode(avro::DecoderPtr& decoder) { + return decoder->decodeLong(); +} + +template <> +inline float Decode(avro::DecoderPtr& decoder) { + return decoder->decodeFloat(); +} + +template <> +inline double Decode(avro::DecoderPtr& decoder) { + return decoder->decodeDouble(); +} + +template <> +inline bool Decode(avro::DecoderPtr& decoder) { + return decoder->decodeBool(); +} + +} // namespace decoder_t +} // namespace avro + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_AVRO_DECODER_TEMPLATE_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/decoder_base.h b/tensorflow_io/core/kernels/avro/atds/decoder_base.h new file mode 100644 index 000000000..456984ae3 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/decoder_base.h @@ -0,0 +1,70 
@@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECODER_BASE_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECODER_BASE_H_ + +#include "api/Decoder.hh" +#include "api/GenericDatum.hh" +#include "api/Node.hh" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow_io/core/kernels/avro/atds/sparse_value_buffer.h" + +namespace tensorflow { +namespace atds { + +enum class FeatureType { dense, sparse, varlen, opaque_contextual }; + +static const std::map avro_to_tf_datatype = { + {avro::AVRO_INT, DT_INT32}, {avro::AVRO_LONG, DT_INT64}, + {avro::AVRO_STRING, DT_STRING}, {avro::AVRO_BYTES, DT_STRING}, + {avro::AVRO_FLOAT, DT_FLOAT}, {avro::AVRO_DOUBLE, DT_DOUBLE}, + {avro::AVRO_BOOL, DT_BOOL}}; + +/* + * Decoders decode avro features into Tensors. + * All decoder implementations must implement the operator overload '()'. + * Decoders are invoked in a multithreaded context(controlled by + * `num_parallel_calls`). Therefore the implementations must be threadsafe. 
+ * TODO: Add static analysis to check thread-safety(BDP-7562) + * */ +class DecoderBase { + public: + virtual ~DecoderBase() {} + + virtual Status operator()(avro::DecoderPtr&, std::vector&, + sparse::ValueBuffer&, + std::vector&, size_t) = 0; +}; + +/* + * Template Metadata class must implement the following public members. + * FeatureType type + * string name + * DataType dtype + * PartialTensorShape shape + * */ +template +std::unique_ptr CreateFeatureDecoder(const avro::NodePtr&, + const Metadata&); + +template +Status ValidateSchema(const avro::NodePtr&, const Metadata&); + +} // namespace atds +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECODER_BASE_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/decoder_test_util.cc b/tensorflow_io/core/kernels/avro/atds/decoder_test_util.cc new file mode 100644 index 000000000..bfb88fa2d --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/decoder_test_util.cc @@ -0,0 +1,208 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow_io/core/kernels/avro/atds/decoder_test_util.h" + +#include "api/Compiler.hh" +#include "api/Generic.hh" +#include "api/Specific.hh" +#include "api/ValidSchema.hh" + +namespace tensorflow { +namespace atds { + +constexpr const char kATDSSchemaPrefix[] = + "{" + "\"type\" : \"record\", " + "\"name\" : \"AvroTensorDataset\", " + "\"namespace\" : \"com.organization.avrotensordataset\", " + "\"fields\" : [ "; + +constexpr const char kATDSSchemaSuffix[] = + " ] " + "}"; + +ATDSSchemaBuilder::ATDSSchemaBuilder() + : schema_(kATDSSchemaPrefix), num_of_features_(0) {} + +ATDSSchemaBuilder& ATDSSchemaBuilder::AddDenseFeature( + const string& name, DataType dtype, size_t rank, + const avro::Type avro_type) { + string type = GenerateArrayType(dtype, rank, avro_type); + string feature_schema = BuildFeatureSchema(name, type); + AddFeature(feature_schema); + return *this; +} + +ATDSSchemaBuilder& ATDSSchemaBuilder::AddSparseFeature( + const string& name, DataType dtype, size_t rank, + const avro::Type avro_type) { + std::vector order(rank + 1, 0); + for (size_t i = 0; i < order.size(); i++) { + order[i] = i; + } + AddSparseFeature(name, dtype, order, avro_type); + return *this; +} + +ATDSSchemaBuilder& ATDSSchemaBuilder::AddSparseFeature( + const string& name, DataType dtype, const std::vector& order, + const avro::Type avro_type) { + string indices_type = GenerateArrayType(DT_INT64, 1); + string values_type = GenerateArrayType(dtype, 1, avro_type); + string fields = ""; + + auto values_index = order.size() - 1; + for (size_t i = 0; i < order.size(); i++) { + if (i > 0) { + fields += ", "; + } + if (order[i] == values_index) { + fields += BuildFeatureSchema("values", values_type); + } else { + auto indices_name = "indices" + std::to_string(order[i]); + fields += BuildFeatureSchema(indices_name, indices_type); + } + } + + string type = + "{" + "\"type\" : \"record\", " + 
"\"name\" : \"" + + name + + "\", " + "\"fields\" : [ " + + fields + + " ] " + "}"; + string feature_schema = BuildFeatureSchema(name, type); + AddFeature(feature_schema); + return *this; +} + +ATDSSchemaBuilder& ATDSSchemaBuilder::AddOpaqueContextualFeature( + const string& name, const string& type) { + string feature_schema = BuildFeatureSchema(name, type); + AddFeature(feature_schema); + return *this; +} + +string ATDSSchemaBuilder::Build() { return schema_ + kATDSSchemaSuffix; } + +avro::ValidSchema ATDSSchemaBuilder::BuildVaildSchema() { + string schema = Build(); + + std::istringstream iss(schema); + avro::ValidSchema valid_schema; + avro::compileJsonSchema(iss, valid_schema); + return valid_schema; +} + +void ATDSSchemaBuilder::AddFeature(const string& feature_schema) { + if (num_of_features_ > 0) { + schema_ += ", "; + } + schema_ += feature_schema; + num_of_features_++; +} + +string ATDSSchemaBuilder::BuildFeatureSchema(const string& name, + const string& type) { + return "{" + "\"name\" : \"" + + name + + "\", " + "\"type\" : " + + type + " }"; +} + +string ATDSSchemaBuilder::BuildNullableFeatureSchema(const string& name, + const string& type) { + return "{" + "\"name\" : \"" + + name + + "\", " + "\"type\" : [ \"null\", " + + type + + " ] " + "}"; +} + +string ATDSSchemaBuilder::GenerateDataType(DataType dtype, + const avro::Type avro_type) { + switch (dtype) { + case DT_INT32: { + return "\"int\""; + } + case DT_INT64: { + return "\"long\""; + } + case DT_FLOAT: { + return "\"float\""; + } + case DT_DOUBLE: { + return "\"double\""; + } + case DT_STRING: { + if (avro_type == avro::AVRO_BYTES) { + return "\"bytes\""; + } + return "\"string\""; + } + case DT_BOOL: { + return "\"boolean\""; + } + default: { + return ""; + } + } +} + +string ATDSSchemaBuilder::GenerateArrayType(DataType dtype, size_t rank, + const avro::Type avro_type) { + if (rank == 0) { + return GenerateDataType(dtype, avro_type); + } + + string type = GenerateArrayType(dtype, rank - 1, 
avro_type); + return "{" + "\"type\" : \"array\", " + "\"items\" : " + + type + " }"; +} + +avro::OutputStreamPtr EncodeAvroGenericDatum(avro::GenericDatum& datum) { + avro::EncoderPtr encoder = avro::binaryEncoder(); + avro::OutputStreamPtr out_stream = avro::memoryOutputStream(); + encoder->init(*out_stream); + avro::encode(*encoder, datum); + encoder->flush(); + return std::move(out_stream); +} + +avro::OutputStreamPtr EncodeAvroGenericData( + std::vector& data) { + avro::EncoderPtr encoder = avro::binaryEncoder(); + avro::OutputStreamPtr out_stream = avro::memoryOutputStream(); + encoder->init(*out_stream); + for (auto& datum : data) { + avro::encode(*encoder, datum); + } + encoder->flush(); + return std::move(out_stream); +} + +} // namespace atds +} // namespace tensorflow diff --git a/tensorflow_io/core/kernels/avro/atds/decoder_test_util.h b/tensorflow_io/core/kernels/avro/atds/decoder_test_util.h new file mode 100644 index 000000000..1407c5b90 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/decoder_test_util.h @@ -0,0 +1,404 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECODER_TEST_UTIL_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECODER_TEST_UTIL_H_ + +#include "api/Encoder.hh" +#include "api/GenericDatum.hh" +#include "api/Node.hh" +#include "api/Specific.hh" +#include "api/Stream.hh" +#include "api/ValidSchema.hh" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow_io/core/kernels/avro/atds/atds_decoder.h" + +namespace tensorflow { +namespace atds { + +using byte_array = std::vector; + +class ATDSSchemaBuilder { + public: + ATDSSchemaBuilder(); + + ATDSSchemaBuilder& AddDenseFeature( + const string& name, DataType dtype, size_t rank, + const avro::Type avro_type = avro::AVRO_NULL); + ATDSSchemaBuilder& AddSparseFeature( + const string& name, DataType dtype, size_t rank, + const avro::Type avro_type = avro::AVRO_NULL); + ATDSSchemaBuilder& AddSparseFeature( + const string& name, DataType dtype, const std::vector& order, + const avro::Type avro_type = avro::AVRO_NULL); + ATDSSchemaBuilder& AddOpaqueContextualFeature(const string& name, + const string& type); + + string Build(); + avro::ValidSchema BuildVaildSchema(); + + private: + void AddFeature(const string&); + string BuildFeatureSchema(const string&, const string&); + string BuildNullableFeatureSchema(const string&, const string&); + string GenerateDataType(DataType, const avro::Type = avro::AVRO_NULL); + string GenerateArrayType(DataType, size_t, + const avro::Type = avro::AVRO_NULL); + + string schema_; + size_t num_of_features_; +}; + +template +DataType GetDataType() { + return DataTypeToEnum().value; +} + +template <> +inline DataType GetDataType() { + return DT_STRING; +} + +inline std::vector StringToByte(const std::string& s) { + std::vector result; + result.reserve(s.size()); + std::copy(s.begin(), s.end(), 
std::back_inserter(result)); + return result; +} + +inline std::string ByteToString(const std::vector& t) { + std::string result; + std::copy(t.begin(), t.end(), std::back_inserter(result)); + return result; +} + +// avro::Type is used to differentiate between byte and string, both of which +// map to datatype +template +void AddDenseValue(avro::GenericDatum& datum, const string& name, + const T& value) { + auto& record = datum.value(); + auto& feature = record.field(name); + feature.value() = value; +} + +template +void AddDenseValue(avro::GenericDatum& datum, const string& name, + const std::vector& values) { + auto& record = datum.value(); + auto& feature = record.field(name).value(); + auto& feature_values = feature.value(); + for (T value : values) { + feature_values.emplace_back(value); + } +} + +template <> +inline void AddDenseValue(avro::GenericDatum& datum, const string& name, + const byte_array& value) { + auto& record = datum.value(); + auto& feature = record.field(name); + feature.value() = value; +} + +template +inline void AddDenseValue(avro::GenericDatum& datum, const string& name, + const std::vector>& values) { + auto& record = datum.value(); + auto& feature = record.field(name).value(); + auto& sub_array_schema = feature.schema()->leafAt(0); + + auto& feature_values = feature.value(); + for (size_t i = 0; i < values.size(); i++) { + feature_values.emplace_back(sub_array_schema); + auto& sub_array = feature_values.back().value().value(); + for (size_t j = 0; j < values[i].size(); j++) { + sub_array.emplace_back(values[i][j]); + } + } +} + +template <> +inline void AddDenseValue(avro::GenericDatum& datum, const string& name, + const std::vector& values) { + auto& record = datum.value(); + auto& feature = record.field(name).value(); + auto& feature_values = feature.value(); + for (byte_array value : values) { + feature_values.emplace_back(value); + } +} + +template +void AddSparseValue(avro::GenericDatum& datum, const string& name, + const 
std::vector>& indices, + const std::vector& values) { + auto& record = datum.value(); + auto& feature = record.field(name).value(); + + for (size_t i = 0; i < indices.size(); i++) { + auto indices_key = "indices" + std::to_string(i); + auto& indices_array = + feature.field(indices_key).value().value(); + for (long index : indices[i]) { + indices_array.emplace_back(static_cast(index)); + } + } + + auto& values_array = + feature.field("values").value().value(); + for (T value : values) { + values_array.emplace_back(value); + } +} + +avro::OutputStreamPtr EncodeAvroGenericDatum(avro::GenericDatum& datum); +avro::OutputStreamPtr EncodeAvroGenericData( + std::vector& data); + +template +void AssertValueEqual(const T& v1, const F& v2) { + ASSERT_EQ(v1, v2); +} + +template <> +inline void AssertValueEqual(const avro::NodePtr& v1, const avro::NodePtr& v2) { + ASSERT_EQ(v1->type(), v2->type()); + ASSERT_EQ(v1->leaves(), v2->leaves()); + for (size_t i = 0; i < v1->leaves(); i++) { + AssertValueEqual(v1->leafAt(i), v2->leafAt(i)); + } +} + +template <> +inline void AssertValueEqual(const avro::ValidSchema& v1, + const avro::ValidSchema& v2) { + AssertValueEqual(v1.root(), v2.root()); +} + +template <> +inline void AssertValueEqual(const tstring& v1, const string& v2) { + ASSERT_STREQ(v1.c_str(), v2.c_str()); +} + +template <> +inline void AssertValueEqual(const string& v1, const tstring& v2) { + ASSERT_STREQ(v1.c_str(), v2.c_str()); +} + +inline void AssertValueEqual(const char* v1, const char* v2, int len) { + for (int i = 0; i < len; i++) { + ASSERT_EQ(v1[i], v2[i]); + } +} + +template <> +inline void AssertValueEqual(const float& v1, const float& v2) { + ASSERT_NEAR(v1, v2, 1e-6); +} + +template <> +inline void AssertValueEqual(const double& v1, const double& v2) { + ASSERT_NEAR(v1, v2, 1e-6); +} + +template +void AssertVectorValues(const std::vector& actual, + const std::vector& expected) { + ASSERT_EQ(actual.size(), expected.size()); + for (size_t i = 0; i < 
expected.size(); i++) { + AssertValueEqual(actual[i], expected[i]); + } +} + +template +inline void AssertVectorValues(const std::vector& actual, + const std::vector& expected) { + ASSERT_EQ(actual.size(), expected.size()); + for (size_t i = 0; i < expected.size(); i++) { + AssertValueEqual(actual[i], ByteToString(expected[i])); + } +} + +template +void AssertTensorValues(const Tensor& tensor, const T& scalar) { + AssertValueEqual(tensor.scalar()(), scalar); +} + +template <> +inline void AssertTensorValues(const Tensor& tensor, const string& scalar) { + AssertValueEqual(tensor.scalar()(), scalar); +} + +template +void AssertTensorValues(const Tensor& tensor, const std::vector& vec) { + for (size_t i = 0; i < vec.size(); i++) { + AssertValueEqual(tensor.vec()(i), vec[i]); + } + ASSERT_EQ(tensor.NumElements(), vec.size()); +} + +template <> +inline void AssertTensorValues(const Tensor& tensor, const byte_array& scalar) { + AssertValueEqual(tensor.scalar()(), ByteToString(scalar)); +} + +template <> +inline void AssertTensorValues(const Tensor& tensor, + const std::vector& vec) { + for (size_t i = 0; i < vec.size(); i++) { + AssertValueEqual(tensor.vec()(i), vec[i]); + } + ASSERT_EQ(tensor.NumElements(), vec.size()); +} + +template +void AssertTensorValues(const Tensor& tensor, + const std::vector>& matrix) { + size_t size = 0; + for (size_t i = 0; i < matrix.size(); i++) { + for (size_t j = 0; j < matrix[i].size(); j++) { + AssertValueEqual(tensor.matrix()(i, j), matrix[i][j]); + } + size += matrix[i].size(); + } + ASSERT_EQ(tensor.NumElements(), size); +} + +template <> +inline void AssertTensorValues(const Tensor& tensor, + const std::vector& vec) { + for (size_t i = 0; i < vec.size(); i++) { + AssertValueEqual(tensor.vec()(i), ByteToString(vec[i])); + } + ASSERT_EQ(tensor.NumElements(), vec.size()); +} + +template <> +inline void AssertTensorValues(const Tensor& tensor, + const std::vector>& matrix) { + size_t size = 0; + for (size_t i = 0; i < matrix.size(); 
i++) { + for (size_t j = 0; j < matrix[i].size(); j++) { + AssertValueEqual(tensor.matrix()(i, j), matrix[i][j]); + } + size += matrix[i].size(); + } + ASSERT_EQ(tensor.NumElements(), size); +} + +template <> +inline void AssertTensorValues( + const Tensor& tensor, const std::vector>& matrix) { + size_t size = 0; + for (size_t i = 0; i < matrix.size(); i++) { + for (size_t j = 0; j < matrix[i].size(); j++) { + AssertValueEqual(tensor.matrix()(i, j), + ByteToString(matrix[i][j])); + } + size += matrix[i].size(); + } + ASSERT_EQ(tensor.NumElements(), size); +} + +template +void AssertTensorRangeEqual(const Tensor& tensor, std::vector values, + size_t offset) { + for (size_t i = 0; i < values.size(); i++) { + T actual = tensor.vec()(offset + i); + AssertValueEqual(actual, values[i]); + } +} + +template <> +inline void AssertTensorRangeEqual(const Tensor& tensor, + std::vector values, size_t offset) { + for (size_t i = 0; i < values.size(); i++) { + tstring actual = tensor.vec()(offset + i); + AssertValueEqual(actual, values[i]); + } +} + +template +void ValidateBuffer(sparse::ValueBuffer& buffer, const Metadata& metadata, + std::vector indices, std::vector values, + std::vector num_of_elements) { + size_t indices_index = metadata.indices_index; + size_t values_index = metadata.values_index; + + AssertVectorValues(buffer.indices[indices_index], indices); + std::vector& actual_values = + sparse::GetValueVector(buffer, values_index); + AssertVectorValues(actual_values, values); + AssertVectorValues(buffer.num_of_elements[indices_index], num_of_elements); +} + +template +void ValidateBuffer(sparse::ValueBuffer& buffer, const Metadata& metadata, + std::vector indices, std::vector values, + std::vector num_of_elements) { + size_t indices_index = metadata.indices_index; + size_t values_index = metadata.values_index; + + AssertVectorValues(buffer.indices[indices_index], indices); + std::vector& actual_values = + sparse::GetValueVector(buffer, values_index); + 
AssertVectorValues(actual_values, values); + AssertVectorValues(buffer.num_of_elements[indices_index], num_of_elements); +} + +namespace sparse { + +template +std::vector>& GetValuesBuffer(ValueBuffer& buffer); + +template <> +inline std::vector>& GetValuesBuffer(ValueBuffer& buffer) { + return buffer.int_values; +} + +template <> +inline std::vector>& GetValuesBuffer(ValueBuffer& buffer) { + return buffer.long_values; +} + +template <> +inline std::vector>& GetValuesBuffer(ValueBuffer& buffer) { + return buffer.float_values; +} + +template <> +inline std::vector>& GetValuesBuffer(ValueBuffer& buffer) { + return buffer.double_values; +} + +template <> +inline std::vector>& GetValuesBuffer(ValueBuffer& buffer) { + return buffer.string_values; +} + +template <> +inline std::vector>& GetValuesBuffer(ValueBuffer& buffer) { + return buffer.bool_values; +} + +} // namespace sparse + +} // namespace atds +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECODER_TEST_UTIL_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/decompression_handler.h b/tensorflow_io/core/kernels/avro/atds/decompression_handler.h new file mode 100644 index 000000000..d3aed08bc --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/decompression_handler.h @@ -0,0 +1,120 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECOMPRESSION_HANDLER_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECOMPRESSION_HANDLER_H_ + +#include // for boost::crc_32_type +#include +#include +#include +#include + +#include "api/Compiler.hh" +#include "api/DataFile.hh" +#include "api/Decoder.hh" +#include "api/Specific.hh" +#include "api/Stream.hh" +#include "api/ValidSchema.hh" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow_io/core/kernels/avro/atds/avro_block_reader.h" + +#ifdef SNAPPY_CODEC_AVAILABLE +#include +#endif +namespace tensorflow { +namespace data { +class DecompressionHandler { + public: + DecompressionHandler() {} + + // Adapted from + // https://github.com/apache/avro/blob/release-1.9.1/lang/c++/impl/DataFile.cc#L58 + boost::iostreams::zlib_params get_zlib_params() { + boost::iostreams::zlib_params ret; + ret.method = boost::iostreams::zlib::deflated; + ret.noheader = true; + return ret; + } + +#ifdef SNAPPY_CODEC_AVAILABLE + avro::InputStreamPtr decompressSnappyCodec(AvroBlock& block) { + boost::crc_32_type crc; + std::string uncompressed; + size_t len = block.content.size(); + const auto& compressed = block.content; + int b1 = compressed[len - 4] & 0xFF; + int b2 = compressed[len - 3] & 0xFF; + int b3 = compressed[len - 2] & 0xFF; + int b4 = compressed[len - 1] & 0xFF; + + uint32_t checksum = (b1 << 24) + (b2 << 16) + (b3 << 8) + (b4); + if (!snappy::Uncompress(compressed.data(), len - 4, &uncompressed)) { + throw avro::Exception( + "Snappy Compression reported an error when decompressing"); + } + crc.process_bytes(uncompressed.data(), uncompressed.size()); + uint32_t c = crc(); + if (checksum != c) { + throw avro::Exception( + boost::format("Checksum did not match for Snappy compression: " + "Expected: %1%, computed: %2%") % + checksum % c); + } + block.content = uncompressed; + 
block.byte_count = uncompressed.size(); + block.codec = avro::NULL_CODEC; + uint8_t* dt = + reinterpret_cast(block.content.data() + block.read_offset); + return avro::memoryInputStream(dt, + block.content.size() - block.read_offset); + } +#endif + + avro::InputStreamPtr decompressDeflateCodec(AvroBlock& block) { + boost::iostreams::filtering_istream stream; + stream.push(boost::iostreams::zlib_decompressor(get_zlib_params())); + stream.push(boost::iostreams::basic_array_source( + block.content.data(), block.content.size())); + auto uncompressed = tstring(); + auto reader = avro::nonSeekableIstreamInputStream(stream); + size_t n_data = 0; + + const uint8_t* data = nullptr; + while (reader->next(&data, &n_data)) { + uncompressed.append((const char*)data, n_data); + } + block.content = uncompressed; + block.codec = avro::NULL_CODEC; + block.byte_count = uncompressed.size(); + uint8_t* dt = + reinterpret_cast(block.content.data() + block.read_offset); + return avro::memoryInputStream(dt, + block.content.size() - block.read_offset); + } + + avro::InputStreamPtr decompressNullCodec(AvroBlock& block) { + size_t offset = block.read_offset; + uint8_t* data = reinterpret_cast(block.content.data() + offset); + size_t size = block.content.size() - offset; + return avro::memoryInputStream(data, size); + } +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DECOMPRESSION_HANDLER_H_ \ No newline at end of file diff --git a/tensorflow_io/core/kernels/avro/atds/dense_feature_decoder.h b/tensorflow_io/core/kernels/avro/atds/dense_feature_decoder.h new file mode 100644 index 000000000..b24d3854c --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/dense_feature_decoder.h @@ -0,0 +1,244 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DENSE_FEATURE_DECODER_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DENSE_FEATURE_DECODER_H_ + +#include "api/Decoder.hh" +#include "api/Node.hh" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow_io/core/kernels/avro/atds/avro_decoder_template.h" +#include "tensorflow_io/core/kernels/avro/atds/decoder_base.h" +#include "tensorflow_io/core/kernels/avro/atds/errors.h" + +namespace tensorflow { +namespace atds { + +namespace dense { + +struct Metadata { + Metadata(FeatureType type, const string& name, DataType dtype, + const PartialTensorShape& shape, size_t tensor_position) + : type(type), + name(name), + dtype(dtype), + shape(shape), + tensor_position(tensor_position) {} + + FeatureType type; + string name; + DataType dtype; + PartialTensorShape shape; + + size_t tensor_position; +}; + +template +inline Status DecodeFixedLenArray(avro::DecoderPtr& decoder, T** buf, int rank, + const PartialTensorShape& shape) { + if (rank == 0) { + *((*buf)++) = avro::decoder_t::Decode(decoder); + return OkStatus(); + } + + int dim = shape.dims() - rank; + size_t size = static_cast(shape.dim_size(dim)); + size_t number = 0; + if (rank == 1) { + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + number += m; + if (TF_PREDICT_FALSE(number > size)) { + return ShapeError(number, dim, shape); + } + for (size_t i = 0; i < 
m; i++) { + *((*buf)++) = avro::decoder_t::Decode(decoder); + } + } + if (TF_PREDICT_FALSE(number != size)) { + return ShapeError(number, dim, shape); + } + return OkStatus(); + } + + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + number += m; + if (TF_PREDICT_FALSE(number > size)) { + return ShapeError(number, dim, shape); + } + for (size_t i = 0; i < m; i++) { + TF_RETURN_IF_ERROR(DecodeFixedLenArray(decoder, buf, rank - 1, shape)); + } + } + if (TF_PREDICT_FALSE(number != size)) { + return ShapeError(number, dim, shape); + } + return OkStatus(); +} + +// This template specification handles both byte and string. +// It assumes that avro decodeBytes and decodeString are both reading bytes into +// uint8 arrays see: +// https://github.com/apache/avro/blob/branch-1.9/lang/c%2B%2B/impl/BinaryDecoder.cc#L133 +// As long as that as that assumption holds a separate bytes implementation is +// not required. +template <> +inline Status DecodeFixedLenArray(avro::DecoderPtr& decoder, tstring** buf, + int rank, const PartialTensorShape& shape) { + std::string s; + if (rank == 0) { + decoder->decodeString(s); + *((*buf)++) = s; + return OkStatus(); + } + + int dim = shape.dims() - rank; + size_t size = static_cast(shape.dim_size(dim)); + size_t number = 0; + if (rank == 1) { + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + number += m; + if (TF_PREDICT_FALSE(number > size)) { + return ShapeError(number, dim, shape); + } + for (size_t i = 0; i < m; i++) { + decoder->decodeString(s); + *((*buf)++) = s; + } + } + if (TF_PREDICT_FALSE(number != size)) { + return ShapeError(number, dim, shape); + } + return OkStatus(); + } + + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + number += m; + if (TF_PREDICT_FALSE(number > size)) { + return ShapeError(number, dim, shape); + } + for (size_t i = 0; i < m; i++) { + TF_RETURN_IF_ERROR( + DecodeFixedLenArray(decoder, buf, rank - 1, shape)); + } + } + if 
(TF_PREDICT_FALSE(number != size)) { + return ShapeError(number, dim, shape); + } + return OkStatus(); +} + +template +class FeatureDecoder : public DecoderBase { + public: + explicit FeatureDecoder(const Metadata& metadata) + : metadata_(metadata), rank_(metadata.shape.dims()) {} + + Status operator()(avro::DecoderPtr& decoder, + std::vector& dense_tensors, + sparse::ValueBuffer& buffer, + std::vector& skipped_data, + size_t offset) { + auto size = metadata_.shape.num_elements(); + auto& tensor = dense_tensors[metadata_.tensor_position]; + T* buf = reinterpret_cast(tensor.data()) + offset * size; + return DecodeFixedLenArray(decoder, &buf, rank_, metadata_.shape); + } + + private: + const Metadata& metadata_; + const int rank_; +}; + +} // namespace dense + +template <> +inline std::unique_ptr CreateFeatureDecoder( + const avro::NodePtr& node, const dense::Metadata& metadata) { + switch (metadata.dtype) { + case DT_INT32: { + return std::move(std::make_unique>(metadata)); + } + case DT_INT64: { + return std::move(std::make_unique>(metadata)); + } + case DT_FLOAT: { + return std::move( + std::make_unique>(metadata)); + } + case DT_DOUBLE: { + return std::move( + std::make_unique>(metadata)); + } + case DT_STRING: { + return std::move( + std::make_unique>(metadata)); + } + case DT_BOOL: { + return std::move(std::make_unique>(metadata)); + } + default: { + TypeNotSupportedAbort(metadata.dtype); + } + } + return nullptr; +} + +template <> +inline Status ValidateSchema(const avro::NodePtr& node, + const dense::Metadata& metadata) { + avro::NodePtr n = node; + size_t avro_rank = 0; + // Check schema consists of non-nullable nested arrays. 
+ while (n->leaves() != 0) { + if (n->leaves() != 1 || n->type() != avro::AVRO_ARRAY) { + std::ostringstream oss; + node->printJson(oss, 0); + return InvalidDenseFeatureSchema(metadata.name, oss.str()); + } + n = n->leafAt(0); + avro_rank++; + } + avro::Type avro_type = n->type(); + std::map::const_iterator tf_type = + avro_to_tf_datatype.find(avro_type); + if (tf_type == avro_to_tf_datatype.end()) { + // Check schema data type is supported. + std::ostringstream oss; + node->printJson(oss, 0); + return UnsupportedValueTypeError(metadata.name, oss.str()); + } else if (tf_type->second != metadata.dtype) { + // Check schema data type and metadata type match. + std::ostringstream oss; + node->printJson(oss, 0); + return SchemaValueTypeMismatch(metadata.name, avro_type, metadata.dtype, + oss.str()); + } + // Check schema rank and metadata rank match. + size_t metadata_rank = static_cast(metadata.shape.dims()); + if (avro_rank != metadata_rank) { + std::ostringstream oss; + node->printJson(oss, 0); + return FeatureRankMismatch(metadata.name, avro_rank, metadata_rank, + oss.str()); + } + return OkStatus(); +} + +} // namespace atds +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DENSE_FEATURE_DECODER_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/dense_feature_decoder_test.cc b/tensorflow_io/core/kernels/avro/atds/dense_feature_decoder_test.cc new file mode 100644 index 000000000..b972674af --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/dense_feature_decoder_test.cc @@ -0,0 +1,183 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "api/Decoder.hh" +#include "api/Stream.hh" +#include "tensorflow/core/platform/test.h" +#include "tensorflow_io/core/kernels/avro/atds/atds_decoder.h" +#include "tensorflow_io/core/kernels/avro/atds/decoder_test_util.h" + +namespace tensorflow { +namespace atds { +namespace dense { + +template +void DenseDecoderTest(const T& values, DataType dtype, + std::initializer_list shape, + const avro::Type avro_type = avro::AVRO_NULL) { + string feature_name = "feature"; + ATDSSchemaBuilder schema_builder = ATDSSchemaBuilder(); + schema_builder.AddDenseFeature(feature_name, dtype, shape.size(), avro_type); + + string schema = schema_builder.Build(); + avro::ValidSchema writer_schema = schema_builder.BuildVaildSchema(); + avro::GenericDatum atds_datum(writer_schema); + AddDenseValue(atds_datum, feature_name, values); + avro::OutputStreamPtr out_stream = EncodeAvroGenericDatum(atds_datum); + avro::InputStreamPtr in_stream = avro::memoryInputStream(*out_stream); + avro::DecoderPtr decoder = avro::binaryDecoder(); + decoder->init(*in_stream); + + std::vector dense_features; + std::vector sparse_features; + std::vector varlen_features; + size_t pos = 0; + PartialTensorShape tensor_shape(shape); + dense_features.emplace_back(FeatureType::dense, feature_name, dtype, + tensor_shape, pos); + + ATDSDecoder atds_decoder = + ATDSDecoder(dense_features, sparse_features, varlen_features); + Status init_status = atds_decoder.Initialize(writer_schema); + ASSERT_TRUE(init_status.ok()); + + 
sparse::ValueBuffer buffer; + std::vector skipped_data = atds_decoder.GetSkippedData(); + std::vector dense_tensors; + dense_tensors.emplace_back(dtype, TensorShape(shape)); + size_t offset = 0; + + Status decode_status = atds_decoder.DecodeATDSDatum( + decoder, dense_tensors, buffer, skipped_data, offset); + ASSERT_TRUE(decode_status.ok()); + const Tensor tensor = dense_tensors[pos]; + AssertTensorValues(tensor, values); +} + +TEST(DenseDecoderTest, DT_INT32_scalar) { + int value = -7; + DenseDecoderTest(value, DT_INT32, {}); +} + +TEST(DenseDecoderTest, DT_INT32_1D) { + std::vector values = {1, 2, 3}; + DenseDecoderTest(values, DT_INT32, {3}); +} + +TEST(DenseDecoderTest, DT_INT32_2D) { + std::vector> values = {{-1, -2, -3}, {4, 5, 6}, {-7, 8, 9}}; + DenseDecoderTest(values, DT_INT32, {3, 3}); +} + +TEST(DenseDecoderTest, DT_INT64_scalar) { + int64_t value = 1; + DenseDecoderTest(value, DT_INT64, {}); +} + +TEST(DenseDecoderTest, DT_INT64_1D) { + std::vector values = {1}; + DenseDecoderTest(values, DT_INT64, {1}); +} + +TEST(DenseDecoderTest, DT_INT64_2D) { + std::vector> values = {{1}}; + DenseDecoderTest(values, DT_INT64, {1, 1}); +} + +TEST(DenseDecoderTest, DT_FLOAT_scalar) { + float value = -0.6; + DenseDecoderTest(value, DT_FLOAT, {}); +} + +TEST(DenseDecoderTest, DT_FLOAT_1D) { + std::vector values = {1.5, 0.5, 1.7, 2.6}; + DenseDecoderTest(values, DT_FLOAT, {4}); +} + +TEST(DenseDecoderTest, DT_FLOAT_2D) { + std::vector> values = {{-0.1, -0.2, -0.3}, + {-1.4, 5.4, 6.6}}; + DenseDecoderTest(values, DT_FLOAT, {2, 3}); +} + +TEST(DenseDecoderTest, DT_DOUBLE_scalar) { + double value = -0.99; + DenseDecoderTest(value, DT_DOUBLE, {}); +} + +TEST(DenseDecoderTest, DT_DOUBLE_1D) { + std::vector values = {1.852, 0.79}; + DenseDecoderTest(values, DT_DOUBLE, {2}); +} + +TEST(DenseDecoderTest, DT_DOUBLE_2D) { + std::vector> values = {{-3.14, -2.07}}; + DenseDecoderTest(values, DT_DOUBLE, {1, 2}); +} + +TEST(DenseDecoderTest, DT_STRING_scalar) { + string value = 
"abc"; + DenseDecoderTest(value, DT_STRING, {}); +} + +TEST(DenseDecoderTest, DT_STRING_1D) { + std::vector values = {"", "", ""}; + DenseDecoderTest(values, DT_STRING, {3}); +} + +TEST(DenseDecoderTest, DT_STRING_2D) { + std::vector> values = {{"abc"}, {"ABC"}, {"LINKEDIN"}}; + DenseDecoderTest(values, DT_STRING, {3, 1}); +} + +TEST(DenseDecoderTest, DT_BYTES_scalar) { + byte_array value{0xb4, 0xaf, 0x98, 0x1a}; + DenseDecoderTest(value, DT_STRING, {}, avro::AVRO_BYTES); +} + +TEST(DenseDecoderTest, DT_BYTES_1D) { + byte_array v1{0xb4, 0xaf, 0x98, 0x1a}; + byte_array v2{0xb4, 0xaf, 0x98}; + byte_array v3{0xb4, 0x98, 0x1a}; + std::vector values = {v1, v2, v3}; + DenseDecoderTest(values, DT_STRING, {3}, avro::AVRO_BYTES); +} + +TEST(DenseDecoderTest, DT_BYTES_2D) { + byte_array v1{0xb4, 0xaf, 0x98, 0x1a}; + byte_array v2{0xb4, 0xaf, 0x98}; + byte_array v3{0xb4, 0x98, 0x1a}; + std::vector> values = {{v1}, {v2}, {v2}}; + DenseDecoderTest(values, DT_STRING, {3, 1}, avro::AVRO_BYTES); +} + +TEST(DenseDecoderTest, DT_BOOL_scalar) { + bool value = true; + DenseDecoderTest(value, DT_BOOL, {}); +} + +TEST(DenseDecoderTest, DT_BOOL_1D) { + std::vector values = {true, false, true}; + DenseDecoderTest(values, DT_BOOL, {3}); +} + +TEST(DenseDecoderTest, DT_BOOL_2D) { + std::vector> values = {{false, false}, {true, true}}; + DenseDecoderTest(values, DT_BOOL, {2, 2}); +} + +} // namespace dense +} // namespace atds +} // namespace tensorflow diff --git a/tensorflow_io/core/kernels/avro/atds/errors.cc b/tensorflow_io/core/kernels/avro/atds/errors.cc new file mode 100644 index 000000000..8d8b1e118 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/errors.cc @@ -0,0 +1,198 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow_io/core/kernels/avro/atds/errors.h" + +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/strcat.h" + +namespace tensorflow { +namespace atds { + +namespace { +constexpr char kSupportedDTypeMessage[] = + "Only DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_STRING, and DT_BOOL are " + "supported."; +} // namespace + +void TypeNotSupportedAbort(DataType dtype) { + LOG(ERROR) << "Data type " << DataTypeString(dtype) << " is not supported. " + << kSupportedDTypeMessage; + std::abort(); +} + +void SparseIndicesTypeNotSupportedAbort(avro::Type indices_type) { + LOG(ERROR) << "Sparse indices type " << avro::toString(indices_type) + << " is not supported. 
Only AVRO_INT and AVRO_LONG are supported"; + std::abort(); +} + +Status TypeNotSupportedError(DataType dtype) { + return errors::InvalidArgument( + strings::StrCat("Data type ", DataTypeString(dtype), " is not supported.", + kSupportedDTypeMessage)); +} + +Status SparseArraysNotEqualError(const std::vector& decoded_numbers, + const std::vector& feature_index) { + size_t rank = decoded_numbers.size() - 1; + string array_names = "["; + string decoded_values = "["; + for (size_t i = 0; i <= rank; i++) { + if (i > 0) { + strings::StrAppend(&array_names, ", "); + strings::StrAppend(&decoded_values, ", "); + } + strings::StrAppend(&decoded_values, decoded_numbers[i]); + + size_t index = feature_index[i]; + if (index == rank) { + strings::StrAppend(&array_names, "values"); + } else { + strings::StrAppend(&array_names, "indices", index); + } + } + strings::StrAppend(&array_names, "]"); + strings::StrAppend(&decoded_values, "]"); + + return errors::InvalidArgument(strings::StrCat( + "Numbers of decoded value in indice and values array are different. ", + "Numbers of decoded value in ", array_names, " arrays are ", + decoded_values)); +} + +Status ShapeError(size_t number, int dim, const PartialTensorShape& shape) { + return errors::InvalidArgument(strings::StrCat( + "Number of decoded value ", number, + " does not match the expected dimension size ", shape.dim_size(dim), + " at the ", dim + 1, "th dimension in user defined shape ", + shape.DebugString())); +} + +Status NullValueError() { + return errors::InvalidArgument("Feature value is null."); +} + +Status FeatureDecodeError(const string& feature_name, const string& reason) { + return errors::InvalidArgument(strings::StrCat( + "Failed to decode feature ", feature_name, ". Reason: ", reason)); +} + +Status ATDSNotRecordError(const string& type, const string& schema) { + return errors::InvalidArgument( + strings::StrCat("ATDS schema is expected to be an Avro Record but found ", + type, ". 
Invalid schema found: ", schema)); +} + +Status FeatureNotFoundError(const string& feature_name, const string& schema) { + return errors::InvalidArgument(strings::StrCat( + "User defined feature '", feature_name, + "' cannot be found in the input data.", " Input data schema: ", schema)); +} + +Status InvalidUnionTypeError(const string& feature_name, const string& schema) { + return errors::InvalidArgument( + strings::StrCat("Feature '", feature_name, "' has invalid union schema. ", + "A feature can only be an union of itself or an union of " + "'null' type and itself.", + "Invalid union schema found: ", schema)); +} + +Status MissingValuesColumnError(const string& schema) { + return errors::InvalidArgument(strings::StrCat( + "Sparse schema is missing values column. Input data schema: ", schema)); +} + +Status NonContiguousIndicesError(const string& schema) { + return errors::InvalidArgument(strings::StrCat( + "Sparse schema indices should be contiguous (indices0, indices1, ...). ", + "Input data schema: ", schema)); +} + +Status ExtraFieldError(const string& schema) { + return errors::InvalidArgument( + strings::StrCat("Sparse schema can only contain 'indices' columns and a " + "'values' column. ", + "Input data schema: ", schema)); +} + +Status UnsupportedSparseIndicesTypeError(const string& feature_name, + const string& schema) { + return errors::InvalidArgument(strings::StrCat( + "Unsupported indices type found in feature '", feature_name, "'. ", + "Sparse tensor indices must be a non-nullable array of non-nullable int " + "or long. " + "Invalid schema found: ", + schema)); +} + +Status UnsupportedValueTypeError(const string& feature_name, + const string& schema) { + return errors::InvalidArgument(strings::StrCat( + "Unsupported value type found in feature '", feature_name, "'. ", + "Tensor value must be a non-nullable array of non-nullable int, long, " + "float, double, boolean, bytes, or string. 
" + "Invalid schema found: ", + schema)); +} + +Status SchemaValueTypeMismatch(const string& feature_name, avro::Type avro_type, + DataType metadata_type, const string& schema) { + return errors::InvalidArgument(strings::StrCat( + "Schema value type and metadata type mismatch in feature '", feature_name, + "'. ", "Avro schema data type: ", avro::toString(avro_type), + ", metadata type: ", DataTypeString(metadata_type), + ". Invalid schema found: ", schema)); +} + +Status InvalidDenseFeatureSchema(const string& feature_name, + const string& schema) { + return errors::InvalidArgument( + strings::StrCat("Dense feature '", feature_name, + "' must be non-nullable nested arrays only. ", + "Invalid schema found: ", schema)); +} + +Status InvalidVarlenFeatureSchema(const string& feature_name, + const string& schema) { + return errors::InvalidArgument( + strings::StrCat("Varlen feature '", feature_name, + "' must be non-nullable nested arrays only. ", + "Invalid schema found: ", schema)); +} + +Status FeatureRankMismatch(const string& feature_name, size_t avro_rank, + size_t metadata_rank, const string& schema) { + return errors::InvalidArgument(strings::StrCat( + "Mismatch between avro schema rank and metadata rank in feature '", + feature_name, "'. ", "Avro schema rank: ", std::to_string(avro_rank), + ", metadata rank: ", std::to_string(metadata_rank), ". 
", + "Invalid schema found: ", schema)); +} + +Status VariedSchemaNotSupportedError(const string& expected_schema, + const string& filename, + const string& varied_schema, + const string& next_filename) { + return errors::InvalidArgument(strings::StrCat( + "Avro schema should be consistent for all input files.", + " Schema in file ", filename, " varies from the schema in file ", + next_filename, "\n", expected_schema, "\n != \n", varied_schema)); +} + +} // namespace atds +} // namespace tensorflow diff --git a/tensorflow_io/core/kernels/avro/atds/errors.h b/tensorflow_io/core/kernels/avro/atds/errors.h new file mode 100644 index 000000000..3dfe8d4ee --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/errors.h @@ -0,0 +1,80 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_ERRORS_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_ERRORS_H_ + +#include "api/Types.hh" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace atds { + +void TypeNotSupportedAbort(DataType dtype); + +void SparseIndicesTypeNotSupportedAbort(avro::Type type); + +Status TypeNotSupportedError(DataType dtype); + +Status SparseArraysNotEqualError(const std::vector& decoded_numbers, + const std::vector& feature_index); + +Status ShapeError(size_t number, int dim, const PartialTensorShape& shape); + +Status NullValueError(); + +Status FeatureDecodeError(const string& feature_name, const string& reason); + +Status ATDSNotRecordError(const string& type, const string& schema); + +Status FeatureNotFoundError(const string& feature_name, const string& schema); + +Status InvalidUnionTypeError(const string& feature_name, const string& schema); + +Status MissingValuesColumnError(const string& schema); + +Status NonContiguousIndicesError(const string& schema); + +Status ExtraFieldError(const string& schema); + +Status UnsupportedSparseIndicesTypeError(const string& feature_name, + const string& schema); + +Status UnsupportedValueTypeError(const string& feature_name, + const string& schema); + +Status SchemaValueTypeMismatch(const string& feature_name, avro::Type avro_type, + DataType metadata_type, const string& schema); + +Status InvalidDenseFeatureSchema(const string& feature_name, + const string& schema); + +Status InvalidVarlenFeatureSchema(const string& feature_name, + const string& schema); + +Status FeatureRankMismatch(const string& feature_name, size_t avro_rank, + size_t metadata_rank, const string& schema); + +Status VariedSchemaNotSupportedError(const string& expected_schema, + const string& filename, + const 
string& varied_schema, + const string& next_filename); + +} // namespace atds +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_ERRORS_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/opaque_contextual_feature_decoder.h b/tensorflow_io/core/kernels/avro/atds/opaque_contextual_feature_decoder.h new file mode 100644 index 000000000..abd1adfca --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/opaque_contextual_feature_decoder.h @@ -0,0 +1,49 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_OPAQUE_CONTEXTUAL_FEATURE_DECODER_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_OPAQUE_CONTEXTUAL_FEATURE_DECODER_H_ + +#include "api/Decoder.hh" +#include "api/Generic.hh" +#include "api/Specific.hh" +#include "tensorflow_io/core/kernels/avro/atds/decoder_base.h" + +namespace tensorflow { +namespace atds { +namespace opaque_contextual { + +class FeatureDecoder : public DecoderBase { + public: + explicit FeatureDecoder(size_t datum_index) : datum_index_(datum_index) {} + + Status operator()(avro::DecoderPtr& decoder, + std::vector& dense_tensors, + sparse::ValueBuffer& buffer, + std::vector& skipped_data, + size_t offset) { + avro::decode(*decoder, skipped_data[datum_index_]); + return OkStatus(); + } + + private: + const size_t datum_index_; +}; + +} // namespace opaque_contextual +} // namespace atds +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_OPAQUE_CONTEXTUAL_FEATURE_DECODER_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/shuffle_handler.h b/tensorflow_io/core/kernels/avro/atds/shuffle_handler.h new file mode 100644 index 000000000..1e396d38e --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/shuffle_handler.h @@ -0,0 +1,120 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SHUFFLE_HANDLER_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SHUFFLE_HANDLER_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow_io/core/kernels/avro/atds/avro_block_reader.h" + +namespace tensorflow { +namespace data { + +class ShuffleHandler { + public: + ShuffleHandler(mutex* mu) { + mu_ = mu; + ResetRngs(); + } + void SampleBlocks(size_t batch_size, bool shuffle, + std::vector>& blocks) { + size_t i = 0; + size_t block_size = blocks.size(); + // LOG(INFO) << "shuffle batch size " << batch_size << " shuffle block size: + // " << block_size; + if (!shuffle) { + size_t j = 0; + while (i < batch_size) { + auto& random_block = blocks[j]; + random_block->num_to_decode = + std::min(random_block->object_count - random_block->num_decoded, + static_cast(batch_size - i)); + i += random_block->num_to_decode; + if ((random_block->num_decoded + random_block->num_to_decode) == + random_block->object_count) { + j++; + } + } + } else { + while (i < batch_size) { + size_t block_id = Random() % block_size; + // LOG(INFO) << "shuffle block size " << block_size << " block_id: " << + // block_id << " actual block size: " << blocks.size(); + auto& random_block = blocks[block_id]; + int64 remaining = random_block->object_count - + random_block->num_decoded - + random_block->num_to_decode; + if (remaining > 0) { + // Decode the whole block when it has less than 1/10 of the undecoded + // records. It is to quickly recycle the almost decoded blocks. 
+ int64 decode_all_threshold = random_block->object_count / 10; + size_t decode_num = 1; + if (remaining <= decode_all_threshold) { + decode_num = + std::min(static_cast(remaining), batch_size - i); + } + random_block->num_to_decode += decode_num; + i += decode_num; + } + } + } + // update counts so that the elements don't have huge gaps + for (size_t k = 0; k < block_size; k++) { + blocks[k]->counts = blocks[k]->num_to_decode; + if (k > 0) { + blocks[k]->counts += blocks[k - 1]->counts; + } + // LOG(INFO) << "block " << k << " object count: " << + // blocks[k]->object_count << " counts: " << blocks[k]->counts + // << " num_decoded: " << blocks[k]->num_decoded << " + // num_to_decode: " << blocks[k]->num_to_decode; + } + } + // function to produce random numbers + random::SingleSampleAdapter::ResultType Random() + TF_EXCLUSIVE_LOCKS_REQUIRED(*mu_) { + num_random_samples_++; + return generator_->operator()(); + } + + void ResetRngs() TF_EXCLUSIVE_LOCKS_REQUIRED(*mu_) { + // Reset the generators based on the current iterator seeds. + int64 seed_ = random::New64(); + int64 seed2_ = random::New64(); + parent_generator_ = std::make_unique(seed_, seed2_); + generator_ = + std::make_unique>( + parent_generator_.get()); + generator_->Skip(num_random_samples_); + num_random_samples_ = 0; + } + + private: + // this is not owned by ShuffleHandler. 
This is owned by the calling class + mutex* mu_; + int64 num_random_samples_ TF_GUARDED_BY(*mu_) = 0; + std::unique_ptr parent_generator_ TF_GUARDED_BY(*mu_); + std::unique_ptr> generator_ + TF_GUARDED_BY(*mu_); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SHUFFLE_HANDLER_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/shuffle_handler_test.cc b/tensorflow_io/core/kernels/avro/atds/shuffle_handler_test.cc new file mode 100644 index 000000000..0f7c3d01c --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/shuffle_handler_test.cc @@ -0,0 +1,108 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow_io/core/kernels/avro/atds/shuffle_handler.h" + +#include "tensorflow/core/data/name_utils.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/io/inputbuffer.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow_io/core/kernels/avro/atds/avro_block_reader.h" + +namespace tensorflow { +namespace data { + +class ShuffleTest : public ::testing::Test { + protected: + ShuffleTest() { shuffle_handler_ = std::make_unique(&mu_); } + + void SetUp() override { + for (size_t i = 0; i < 10; i++) { + int64 rand_mult = static_cast(rand() % 5 + 5); + blocks_.emplace_back(std::make_unique(AvroBlock{ + rand_mult * 64, // int64_t object_count; + 0, // int64_t num_to_decode; + 0, // int64_t num_decoded; + 100000, // int64_t byte_count; + 0, // int64_t counts; + tstring("haha"), // tstring content; + avro::NULL_CODEC, // avro::Codec codec; + 4888 // size_t read_offset; + })); + } + } + mutex mu_; + std::unique_ptr shuffle_handler_; + std::vector> blocks_; +}; + +TEST_F(ShuffleTest, NoShuffleTest) { + size_t batch_size = 1024; + size_t shuffle_buffer_size = 0; + shuffle_handler_->SampleBlocks(batch_size, shuffle_buffer_size > 0, blocks_); + // assert that the sum of all num_to_decode == batch_size + size_t sum_of_num_to_decode = 0; + for (size_t i = 0; i < blocks_.size(); i++) { + sum_of_num_to_decode += blocks_[i]->num_to_decode; + } + EXPECT_EQ(sum_of_num_to_decode, 
batch_size); +} + +TEST_F(ShuffleTest, ShuffleBufferTest) { + size_t batch_size = 1024; + size_t shuffle_buffer_size = 2048; + shuffle_handler_->SampleBlocks(batch_size, shuffle_buffer_size > 0, blocks_); + // assert that the sum of all num_to_decode == batch_size + size_t sum_of_num_to_decode = 0; + for (size_t i = 0; i < blocks_.size(); i++) { + sum_of_num_to_decode += blocks_[i]->num_to_decode; + } + EXPECT_EQ(sum_of_num_to_decode, batch_size); +} + +TEST_F(ShuffleTest, UniformDistributionTest) { + const int64 bin_size = 10; + int64 bins[bin_size] = {0}; // observed frequencies + int64 error = 50; // none of the 10 bins will differ from the avg (1000 + // datapoints) by more than this + int64 num = 0; + int64 num_samples = 1000; + int64 avg = num_samples / bin_size; + int64 k = 0; + int64 idx = 0; + while (k < num_samples) { + num = shuffle_handler_->Random() % num_samples; + idx = num / + avg; // 0-99 goes to bucket 0, 100-199 goes to bucket 1 and so on. + bins[idx]++; + k++; + } + // check uniformity by ensuring that every bin is near the avg num of points + for (int i = 0; i < bin_size; i++) { + EXPECT_NEAR(bins[i], avg, error); + } +} + +} // namespace data +} // namespace tensorflow diff --git a/tensorflow_io/core/kernels/avro/atds/sparse_feature_decoder.h b/tensorflow_io/core/kernels/avro/atds/sparse_feature_decoder.h new file mode 100644 index 000000000..067bf5c3f --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/sparse_feature_decoder.h @@ -0,0 +1,258 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SPARSE_FEATURE_DECODER_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SPARSE_FEATURE_DECODER_H_ + +#include "api/Decoder.hh" +#include "api/Node.hh" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow_io/core/kernels/avro/atds/avro_decoder_template.h" +#include "tensorflow_io/core/kernels/avro/atds/decoder_base.h" +#include "tensorflow_io/core/kernels/avro/atds/errors.h" +#include "tensorflow_io/core/kernels/avro/atds/sparse_feature_internal_decoder.h" + +namespace tensorflow { +namespace atds { + +namespace sparse { + +struct Metadata { + Metadata(FeatureType type, const string& name, DataType dtype, + const PartialTensorShape& shape, size_t indices_index, + size_t values_index) + : type(type), + name(name), + dtype(dtype), + shape(shape), + indices_index(indices_index), + values_index(values_index) {} + + FeatureType type; + string name; + DataType dtype; + PartialTensorShape shape; + + size_t indices_index; + size_t values_index; +}; + +template +class FeatureDecoder : public DecoderBase { + public: + explicit FeatureDecoder(const Metadata& metadata, + const std::vector& decoding_order, + const std::vector& indices_type) + : metadata_(metadata), + rank_(metadata.shape.dims()), + decoding_order_(decoding_order), + long_indices_decoder_(metadata.indices_index, rank_), + int_indices_decoder_(metadata.indices_index, rank_), + values_decoder_(metadata.values_index) { + auto num_decoders = decoding_order.size(); + decoders_.reserve(num_decoders); + for (size_t i = 0; i < num_decoders; i++) { + auto index = decoding_order[i]; + if (index == rank_) { + 
decoders_.emplace_back(&values_decoder_); + } else if (indices_type[index] == avro::AVRO_LONG) { + decoders_.emplace_back(&long_indices_decoder_); + } else if (indices_type[index] == avro::AVRO_INT) { + decoders_.emplace_back(&int_indices_decoder_); + } else { + SparseIndicesTypeNotSupportedAbort(indices_type[index]); + } + } + } + + Status operator()(avro::DecoderPtr& decoder, + std::vector& dense_tensors, + sparse::ValueBuffer& buffer, + std::vector& skipped_data, + size_t offset) { + size_t num_decoders = decoders_.size(); + std::vector decoded_numbers(num_decoders, 0); + size_t indices_index = metadata_.indices_index; + size_t indices_start = buffer.indices[indices_index].size(); + for (size_t i = 0; i < num_decoders; i++) { + decoded_numbers[i] = decoders_[i]->Decode( + decoder, buffer, decoding_order_[i], indices_start); + } + + if (TF_PREDICT_FALSE(!std::all_of( + decoded_numbers.cbegin(), decoded_numbers.cend(), + [d = decoded_numbers[0]](size_t n) { return n == d; }))) { + return SparseArraysNotEqualError(decoded_numbers, decoding_order_); + } + + // Rank after batching equals to the number of decoders. + FillBatchIndices(buffer.indices[indices_index], indices_start, + static_cast(offset), num_decoders); + + auto& num_of_elements = buffer.num_of_elements[indices_index]; + size_t total_num_elements = decoded_numbers[0]; + if (!num_of_elements.empty()) { + total_num_elements += num_of_elements.back(); + } + num_of_elements.push_back(total_num_elements); + return OkStatus(); + } + + private: + void FillBatchIndices(std::vector& v, size_t indices_start, + long batch_offset, size_t rank_after_batch) { + size_t end = v.size(); + for (size_t i = indices_start; i < end; i += rank_after_batch) { + v[i] = batch_offset; + } + } + + const Metadata& metadata_; + const size_t rank_; + const std::vector decoding_order_; + IndicesDecoder long_indices_decoder_; + IndicesDecoder int_indices_decoder_; + ValuesDecoder values_decoder_; + std::vector decoders_; // not owned. 
+}; + +} // namespace sparse + +template <> +inline std::unique_ptr CreateFeatureDecoder( + const avro::NodePtr& node, const sparse::Metadata& metadata) { + size_t rank = static_cast(metadata.shape.dims()); + std::vector decoding_order(rank + 1); + std::vector indices_types(rank); + + for (size_t d = 0; d < rank; d++) { + auto indice_key = "indices" + std::to_string(d); + size_t indice_pos; + node->nameIndex(indice_key, indice_pos); + decoding_order[indice_pos] = d; + indices_types[d] = node->leafAt(indice_pos)->leafAt(0)->type(); + } + + size_t values_pos; + node->nameIndex("values", values_pos); + decoding_order[values_pos] = rank; + + switch (metadata.dtype) { + case DT_INT32: { + return std::move(std::make_unique>( + metadata, decoding_order, indices_types)); + } + case DT_INT64: { + return std::move(std::make_unique>( + metadata, decoding_order, indices_types)); + } + case DT_FLOAT: { + return std::move(std::make_unique>( + metadata, decoding_order, indices_types)); + } + case DT_DOUBLE: { + return std::move(std::make_unique>( + metadata, decoding_order, indices_types)); + } + case DT_STRING: { + return std::move(std::make_unique>( + metadata, decoding_order, indices_types)); + } + case DT_BOOL: { + return std::move(std::make_unique>( + metadata, decoding_order, indices_types)); + } + default: { + TypeNotSupportedAbort(metadata.dtype); + } + } + return nullptr; +} + +template <> +inline Status ValidateSchema(const avro::NodePtr& node, + const sparse::Metadata& metadata) { + size_t values_pos; + // Check values column exists. + if (!node->nameIndex("values", values_pos)) { + std::ostringstream oss; + node->printJson(oss, 0); + return MissingValuesColumnError(oss.str()); + } + // Check values column is a non-nullable array. 
+ auto value_leaf = node->leafAt(values_pos); + avro::Type value_type = value_leaf->type(); + if (value_type != avro::AVRO_ARRAY) { + std::ostringstream oss; + node->printJson(oss, 0); + return UnsupportedValueTypeError(metadata.name, oss.str()); + } + avro::Type value_item_type = value_leaf->leafAt(0)->type(); + std::map::const_iterator tf_type = + avro_to_tf_datatype.find(value_item_type); + if (tf_type == avro_to_tf_datatype.end()) { + // Check schema data type is supported. + std::ostringstream oss; + node->printJson(oss, 0); + return UnsupportedValueTypeError(metadata.name, oss.str()); + } else if (tf_type->second != metadata.dtype) { + // Check schema data type and metadata type match. + std::ostringstream oss; + node->printJson(oss, 0); + return SchemaValueTypeMismatch(metadata.name, value_item_type, + metadata.dtype, oss.str()); + } + size_t rank = static_cast(metadata.shape.dims()); + for (size_t i = 0; i < rank; i++) { + auto indice_key = "indices" + std::to_string(i); + size_t indice_pos; + // Check for contiguous "indices0", "indices1", ... "indicesN" columns + if (!node->nameIndex(indice_key, indice_pos)) { + std::ostringstream oss; + node->printJson(oss, 0); + return NonContiguousIndicesError(oss.str()); + } + // Check each "indices" column is a non-nullable array. + auto indice_leaf = node->leafAt(indice_pos); + avro::Type indices_type = indice_leaf->type(); + if (indices_type != avro::AVRO_ARRAY) { + std::ostringstream oss; + node->printJson(oss, 0); + return UnsupportedSparseIndicesTypeError(metadata.name, oss.str()); + } + // Check each "indices" array consists of int or long. + avro::Type item_type = indice_leaf->leafAt(0)->type(); + if (item_type != avro::AVRO_INT && item_type != avro::AVRO_LONG) { + std::ostringstream oss; + node->printJson(oss, 0); + return UnsupportedSparseIndicesTypeError(metadata.name, oss.str()); + } + } + // Check schema rank and metadata rank match. 
+ if (node->leaves() != rank + 1) { + std::ostringstream oss; + node->printJson(oss, 0); + return ExtraFieldError(oss.str()); + } + return OkStatus(); +} + +} // namespace atds +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SPARSE_FEATURE_DECODER_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/sparse_feature_decoder_test.cc b/tensorflow_io/core/kernels/avro/atds/sparse_feature_decoder_test.cc new file mode 100644 index 000000000..f62dd1ae1 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/sparse_feature_decoder_test.cc @@ -0,0 +1,258 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow_io/core/kernels/avro/atds/sparse_feature_decoder.h" + +#include "api/Decoder.hh" +#include "api/Stream.hh" +#include "api/ValidSchema.hh" +#include "tensorflow/core/platform/test.h" +#include "tensorflow_io/core/kernels/avro/atds/decoder_test_util.h" + +namespace tensorflow { +namespace atds { +namespace sparse { + +using Indices = std::vector>; + +template +void SparseDecoderTest(const Indices& indices, const std::vector& values, + const std::vector& order, + std::initializer_list shape, long offset, + const avro::Type avro_type = avro::AVRO_NULL) { + DataType dtype = GetDataType(); + string feature_name = "feature"; + ATDSSchemaBuilder schema_builder = ATDSSchemaBuilder(); + schema_builder.AddSparseFeature(feature_name, dtype, order, avro_type); + + string schema = schema_builder.Build(); + avro::ValidSchema writer_schema = schema_builder.BuildVaildSchema(); + avro::GenericDatum atds_datum(writer_schema); + AddSparseValue(atds_datum, feature_name, indices, values); + + avro::OutputStreamPtr out_stream = EncodeAvroGenericDatum(atds_datum); + avro::InputStreamPtr in_stream = avro::memoryInputStream(*out_stream); + avro::DecoderPtr decoder = avro::binaryDecoder(); + decoder->init(*in_stream); + + std::vector dense_features; + std::vector sparse_features; + std::vector varlen_features; + size_t indices_index = 0, values_index = 0; + PartialTensorShape tensor_shape(shape); + sparse_features.emplace_back(FeatureType::sparse, feature_name, dtype, + tensor_shape, indices_index, values_index); + + ATDSDecoder atds_decoder = + ATDSDecoder(dense_features, sparse_features, varlen_features); + Status init_status = atds_decoder.Initialize(writer_schema); + ASSERT_TRUE(init_status.ok()); + + std::vector skipped_data = atds_decoder.GetSkippedData(); + std::vector dense_tensors; + ValueBuffer buffer; + GetValuesBuffer(buffer).resize(1); + buffer.indices.resize(1); + 
buffer.num_of_elements.resize(1); + Status decode_status = atds_decoder.DecodeATDSDatum( + decoder, dense_tensors, buffer, skipped_data, offset); + ASSERT_TRUE(decode_status.ok()); + + auto rank = indices.size(); + auto num_elem = values.size(); + std::vector expected_indices((rank + 1) * num_elem, offset); + for (size_t i = 0; i < indices.size(); i++) { + auto dim = i + 1; + for (size_t j = 0; j < indices[i].size(); j++) { + expected_indices[dim + j * (rank + 1)] = indices[i][j]; + } + } + std::vector expected_num_elements = {num_elem}; + + ValidateBuffer(buffer, sparse_features[0], expected_indices, values, + expected_num_elements); +} + +template <> +inline void SparseDecoderTest(const Indices& indices, + const std::vector& values, + const std::vector& order, + std::initializer_list shape, long offset, + const avro::Type avro_type) { + DataType dtype = DT_STRING; + string feature_name = "feature"; + ATDSSchemaBuilder schema_builder = ATDSSchemaBuilder(); + schema_builder.AddSparseFeature(feature_name, dtype, order, avro_type); + + string schema = schema_builder.Build(); + avro::ValidSchema writer_schema = schema_builder.BuildVaildSchema(); + avro::GenericDatum atds_datum(writer_schema); + AddSparseValue(atds_datum, feature_name, indices, values); + + avro::OutputStreamPtr out_stream = EncodeAvroGenericDatum(atds_datum); + avro::InputStreamPtr in_stream = avro::memoryInputStream(*out_stream); + avro::DecoderPtr decoder = avro::binaryDecoder(); + decoder->init(*in_stream); + + std::vector dense_features; + std::vector sparse_features; + std::vector varlen_features; + size_t indices_index = 0, values_index = 0; + PartialTensorShape tensor_shape(shape); + sparse_features.emplace_back(FeatureType::sparse, feature_name, dtype, + tensor_shape, indices_index, values_index); + + ATDSDecoder atds_decoder = + ATDSDecoder(dense_features, sparse_features, varlen_features); + Status init_status = atds_decoder.Initialize(writer_schema); + ASSERT_TRUE(init_status.ok()); + + 
std::vector skipped_data = atds_decoder.GetSkippedData(); + std::vector dense_tensors; + ValueBuffer buffer; + GetValuesBuffer(buffer).resize(1); + buffer.indices.resize(1); + buffer.num_of_elements.resize(1); + Status decode_status = atds_decoder.DecodeATDSDatum( + decoder, dense_tensors, buffer, skipped_data, offset); + ASSERT_TRUE(decode_status.ok()); + + auto rank = indices.size(); + auto num_elem = values.size(); + std::vector expected_indices((rank + 1) * num_elem, offset); + for (size_t i = 0; i < indices.size(); i++) { + auto dim = i + 1; + for (size_t j = 0; j < indices[i].size(); j++) { + expected_indices[dim + j * (rank + 1)] = indices[i][j]; + } + } + std::vector expected_num_elements = {num_elem}; + + ValidateBuffer(buffer, sparse_features[0], expected_indices, values, + expected_num_elements); +} + +TEST(SparseDecoderTest, DT_INT32_1D) { + std::vector values = {1, 2, 3}; + SparseDecoderTest({{1, 3, 5}}, values, {0, 1}, {10}, 0); +} + +TEST(SparseDecoderTest, DT_INT32_2D) { + std::vector values = {-1, 2}; + SparseDecoderTest({{3, 5}, {2, 4}}, values, {0, 1, 2}, {10, 5}, 0); +} + +TEST(SparseDecoderTest, DT_INT64_1D) { + std::vector values = {4}; + SparseDecoderTest({{1}}, values, {0, 1}, {100}, 0); +} + +TEST(SparseDecoderTest, DT_INT64_2D) { + std::vector values = {77, 99, 131, 121}; + SparseDecoderTest({{3, 3, 3, 3}, {2, 4, 6, 8}}, values, {0, 1, 2}, {10, 9}, + 0); +} + +TEST(SparseDecoderTest, DT_FLOAT_1D) { + std::vector values = {0.0}; + SparseDecoderTest({{0}}, values, {0, 1}, {10}, 0); +} + +TEST(SparseDecoderTest, DT_FLOAT_2D) { + std::vector values = {1.0, 0.0}; + SparseDecoderTest({{3, 5}, {2, 4}}, values, {0, 1, 2}, {10, 5}, 0); +} + +TEST(SparseDecoderTest, DT_DOUBLE_1D) { + std::vector values = {1.0, 2.0, 3.0}; + SparseDecoderTest({{1, 3, 5}}, values, {0, 1}, {256}, 0); +} + +TEST(SparseDecoderTest, DT_DOUBLE_2D) { + std::vector values = {0.77, 0.3145}; + SparseDecoderTest({{0, 1}, {0, 1}}, values, {0, 1, 2}, {2, 2}, 0); +} + 
+TEST(SparseDecoderTest, DT_STRING_1D) { + std::vector values = {"abc"}; + SparseDecoderTest({{1}}, values, {0, 1}, {100}, 0); +} + +TEST(SparseDecoderTest, DT_STRING_2D) { + std::vector values = {"abc", "cdf", "pdf", "rdf"}; + SparseDecoderTest({{1000, 1200, 98742, 919101}, {10101, 9291, 0, 191}}, + values, {0, 1, 2}, {1000000, 12000}, 0); +} + +TEST(SparseDecoderTest, DT_BYTES_1D) { + byte_array value = {0xb4, 0xaf, 0x98, 0x1a}; + std::vector values = {value}; + SparseDecoderTest({{1}}, values, {0, 1}, {100}, 0, avro::AVRO_BYTES); +} + +TEST(SparseDecoderTest, DT_BYTES_2D) { + byte_array v1{0xb4, 0xaf, 0x98, 0x1a}; + byte_array v2{0xb4, 0xaf, 0x98}; + byte_array v3{0xb4, 0x98, 0x1a}; + byte_array v4{0xb4, 0x98}; + std::vector values = {v1, v2, v3, v4}; + SparseDecoderTest({{1000, 1200, 98742, 919101}, {10101, 9291, 0, 191}}, + values, {0, 1, 2}, {1000000, 12000}, 0, avro::AVRO_BYTES); +} + +TEST(SparseDecoderTest, DT_BOOL_1D) { + std::vector values = {true, false, true}; + SparseDecoderTest({{0, 1, 2}}, values, {0, 1}, {10}, 0); +} + +TEST(SparseDecoderTest, DT_BOOL_2D) { + std::vector values = {false, false, true}; + SparseDecoderTest({{3, 5, 5}, {2, 4, 8}}, values, {0, 1, 2}, {10, 10}, 0); +} + +TEST(SparseDecoderTest, 2D_Order_0_2_1) { + std::vector values = {-1, 2}; + SparseDecoderTest({{3, 5}, {2, 4}}, values, {0, 2, 1}, {10, 5}, 0); +} + +TEST(SparseDecoderTest, 2D_Order_2_0_1) { + std::vector values = {-1, 2}; + SparseDecoderTest({{3, 5}, {2, 4}}, values, {2, 0, 1}, {10, 5}, 0); +} + +TEST(SparseDecoderTest, 2D_Order_2_1_0) { + std::vector values = {-1, 2}; + SparseDecoderTest({{3, 5}, {2, 4}}, values, {2, 1, 0}, {10, 5}, 0); +} + +TEST(SparseDecoderTest, 2D_Order_1_2_0) { + std::vector values = {-1, 2}; + SparseDecoderTest({{3, 5}, {2, 4}}, values, {1, 2, 0}, {10, 5}, 0); +} + +TEST(SparseDecoderTest, 2D_Order_1_0_2) { + std::vector values = {-1, 2}; + SparseDecoderTest({{3, 5}, {2, 4}}, values, {1, 0, 2}, {10, 5}, 0); +} + +TEST(SparseDecoderTest, 
NonZeroOffset) { + std::vector values = {77, 99, 131, 121}; + SparseDecoderTest({{3, 3, 3, 3}, {2, 4, 6, 8}}, values, {0, 1, 2}, {10, 9}, + 99); +} + +} // namespace sparse +} // namespace atds +} // namespace tensorflow diff --git a/tensorflow_io/core/kernels/avro/atds/sparse_feature_internal_decoder.h b/tensorflow_io/core/kernels/avro/atds/sparse_feature_internal_decoder.h new file mode 100644 index 000000000..3f717c920 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/sparse_feature_internal_decoder.h @@ -0,0 +1,150 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SPARSE_FEATURE_INTERNAL_DECODER_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SPARSE_FEATURE_INTERNAL_DECODER_H_ + +#include "api/Decoder.hh" +#include "tensorflow_io/core/kernels/avro/atds/avro_decoder_template.h" +#include "tensorflow_io/core/kernels/avro/atds/decoder_base.h" + +namespace tensorflow { +namespace atds { +namespace sparse { + +template +inline size_t DecodeVarLenValues(avro::DecoderPtr& decoder, std::vector& v) { + size_t count = 0; + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + count += m; + for (size_t i = 0; i < m; i++) { + v.emplace_back(avro::decoder_t::Decode(decoder)); + } + } + return count; +} + +// This template specification handles both byte and string. 
+// It assumes that avro decodeBytes and decodeString are both reading bytes into +// uint8 arrays see: +// https://github.com/apache/avro/blob/branch-1.9/lang/c%2B%2B/impl/BinaryDecoder.cc#L133 +// As long as that as that assumption holds a separate bytes implementation is +// not required. +template <> +inline size_t DecodeVarLenValues(avro::DecoderPtr& decoder, + std::vector& v) { + size_t count = 0; + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + count += m; + for (size_t i = 0; i < m; i++) { + v.push_back(""); + decoder->decodeString(v.back()); + } + } + return count; +} + +class InternalDecoder { + public: + virtual ~InternalDecoder() {} + + virtual size_t Decode(avro::DecoderPtr& decoder, ValueBuffer& buffer, + size_t dim, size_t indices_start) = 0; +}; + +template +class ValuesDecoder : public InternalDecoder { + public: + explicit ValuesDecoder(size_t values_index) : values_index_(values_index) {} + + // Two size_t parameters are only used in IndicesDecoder. 
+ size_t Decode(avro::DecoderPtr& decoder, ValueBuffer& buffer, + size_t not_used_1, size_t not_used_2) { + return DecodeVarLenValues(decoder, + GetValueVector(buffer, values_index_)); + } + + private: + const size_t values_index_; +}; + +template < + typename T, + typename = typename std::enable_if< + std::is_same::value || std::is_same::value, T>::type> +class IndicesDecoder : public InternalDecoder { + public: + explicit IndicesDecoder(size_t indices_index, size_t rank) + : indices_index_(indices_index), rank_after_batch_(rank + 1) {} + + size_t Decode(avro::DecoderPtr& decoder, ValueBuffer& buffer, size_t dim, + size_t indices_start) { + auto& v = buffer.indices[indices_index_]; + size_t count = 0; + size_t start = indices_start; + auto dim_after_batch = dim + 1; + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + count += m; + size_t end = start + m * rank_after_batch_; + if (end > v.capacity()) { + v.reserve(2 * v.capacity()); + } + if (end > v.size()) { + v.resize(end); + } + for (size_t i = start + dim_after_batch; i < end; + i += rank_after_batch_) { + v[i] = static_cast(avro::decoder_t::Decode(decoder)); + } + start = end; + } + return count; + } + + private: + const size_t indices_index_; + const size_t rank_after_batch_; +}; + +template <> +inline size_t IndicesDecoder::Decode(avro::DecoderPtr& decoder, + ValueBuffer& buffer, size_t dim, + size_t indices_start) { + auto& v = buffer.indices[indices_index_]; + size_t count = 0; + size_t start = indices_start; + auto dim_after_batch = dim + 1; + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + count += m; + size_t end = start + m * rank_after_batch_; + if (end > v.capacity()) { + v.reserve(2 * v.capacity()); + } + if (end > v.size()) { + v.resize(end); + } + for (size_t i = start + dim_after_batch; i < end; i += rank_after_batch_) { + v[i] = decoder->decodeLong(); + } + start = end; + } + return count; +} + +} // namespace sparse +} // namespace atds 
+} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SPARSE_FEATURE_INTERNAL_DECODER_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/sparse_value_buffer.h b/tensorflow_io/core/kernels/avro/atds/sparse_value_buffer.h new file mode 100644 index 000000000..30440d388 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/sparse_value_buffer.h @@ -0,0 +1,195 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SPARSE_VALUE_BUFFER_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SPARSE_VALUE_BUFFER_H_ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow_io/core/kernels/avro/atds/errors.h" + +namespace tensorflow { +namespace atds { +namespace sparse { + +template +using vecvec = std::vector>; + +struct ValueBuffer { + vecvec int_values; + vecvec long_values; + vecvec float_values; + vecvec double_values; + vecvec bool_values; + vecvec string_values; + + vecvec indices; + vecvec num_of_elements; +}; + +template +std::vector& GetValueVector(ValueBuffer& buffer, size_t index); + +template <> +inline std::vector& GetValueVector(ValueBuffer& buffer, size_t index) { + return buffer.int_values[index]; +} + +template <> +inline std::vector& GetValueVector(ValueBuffer& buffer, size_t index) { + return buffer.long_values[index]; +} + +template <> +inline std::vector& 
GetValueVector(ValueBuffer& buffer, size_t index) { + return buffer.float_values[index]; +} + +template <> +inline std::vector& GetValueVector(ValueBuffer& buffer, size_t index) { + return buffer.double_values[index]; +} + +template <> +inline std::vector& GetValueVector(ValueBuffer& buffer, size_t index) { + return buffer.string_values[index]; +} + +template <> +inline std::vector& GetValueVector(ValueBuffer& buffer, size_t index) { + return buffer.bool_values[index]; +} + +template +const std::vector& GetValueVector(const ValueBuffer& buffer, size_t index); + +template <> +inline const std::vector& GetValueVector(const ValueBuffer& buffer, + size_t index) { + return buffer.int_values[index]; +} + +template <> +inline const std::vector& GetValueVector(const ValueBuffer& buffer, + size_t index) { + return buffer.long_values[index]; +} + +template <> +inline const std::vector& GetValueVector(const ValueBuffer& buffer, + size_t index) { + return buffer.float_values[index]; +} + +template <> +inline const std::vector& GetValueVector(const ValueBuffer& buffer, + size_t index) { + return buffer.double_values[index]; +} + +template <> +inline const std::vector& GetValueVector(const ValueBuffer& buffer, + size_t index) { + return buffer.string_values[index]; +} + +template <> +inline const std::vector& GetValueVector(const ValueBuffer& buffer, + size_t index) { + return buffer.bool_values[index]; +} + +inline Status FillIndicesTensor(const std::vector& buffer, Tensor& tensor, + size_t offset) { + void* dest = + reinterpret_cast(reinterpret_cast(tensor.data()) + offset); + const void* src = reinterpret_cast(buffer.data()); + size_t len = buffer.size() * sizeof(long); + std::memcpy(dest, src, len); + return OkStatus(); +} + +template +inline Status FillValuesTensor(const sparse::ValueBuffer& buffer, + Tensor& tensor, size_t values_index, + size_t offset) { + auto& values = GetValueVector(buffer, values_index); + void* dest = + 
reinterpret_cast(reinterpret_cast(tensor.data()) + offset); + const void* src = reinterpret_cast(values.data()); + size_t len = values.size() * sizeof(T); + std::memcpy(dest, src, len); + return OkStatus(); +} + +template <> +inline Status FillValuesTensor(const sparse::ValueBuffer& buffer, + Tensor& tensor, size_t values_index, + size_t offset) { + auto& values = buffer.string_values[values_index]; + for (size_t i = 0; i < values.size(); i++) { + tensor.flat()(offset++) = std::move(values[i]); + } + return OkStatus(); +} + +template <> +inline Status FillValuesTensor(const sparse::ValueBuffer& buffer, + Tensor& tensor, size_t values_index, + size_t offset) { + auto& values = buffer.bool_values[values_index]; + for (size_t i = 0; i < values.size(); i++) { + tensor.flat()(offset++) = values[i]; + } + return OkStatus(); +} + +inline Status FillValuesTensor(const sparse::ValueBuffer& buffer, + Tensor& values_tensor, DataType dtype, + size_t values_index, size_t offset) { + switch (dtype) { + case DT_INT32: { + return FillValuesTensor(buffer, values_tensor, values_index, offset); + } + case DT_INT64: { + return FillValuesTensor(buffer, values_tensor, values_index, + offset); + } + case DT_FLOAT: { + return FillValuesTensor(buffer, values_tensor, values_index, + offset); + } + case DT_DOUBLE: { + return FillValuesTensor(buffer, values_tensor, values_index, + offset); + } + case DT_STRING: { + return FillValuesTensor(buffer, values_tensor, values_index, + offset); + } + case DT_BOOL: { + return FillValuesTensor(buffer, values_tensor, values_index, + offset); + } + default: { + return TypeNotSupportedError(dtype); + } + } +} + +} // namespace sparse +} // namespace atds +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_SPARSE_VALUE_BUFFER_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/sparse_value_buffer_test.cc b/tensorflow_io/core/kernels/avro/atds/sparse_value_buffer_test.cc new file mode 100644 index 000000000..e002ad32c --- /dev/null 
+++ b/tensorflow_io/core/kernels/avro/atds/sparse_value_buffer_test.cc @@ -0,0 +1,90 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow_io/core/kernels/avro/atds/sparse_value_buffer.h" + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow_io/core/kernels/avro/atds/decoder_test_util.h" + +namespace tensorflow { +namespace atds { +namespace sparse { + +class FillIndicesTensorTest : public ::testing::TestWithParam {}; + +TEST_P(FillIndicesTensorTest, Offset) { + std::vector buffer = {1, 3, 5, 7}; + size_t offset = GetParam(); + int64 limit = static_cast(buffer.size() + offset); + Tensor tensor(DT_INT64, {limit}); + Status status = FillIndicesTensor(buffer, tensor, offset); + ASSERT_TRUE(status.ok()); + AssertTensorRangeEqual(tensor, buffer, offset); +} + +INSTANTIATE_TEST_SUITE_P(offset_0_1_2, FillIndicesTensorTest, + ::testing::Values(0, 1, 2)); + +template +void FillValuesTensorTest(const std::vector& values, size_t values_index, + size_t offset) { + DataType dtype = GetDataType(); + + sparse::ValueBuffer buffer; + auto& values_buffer = GetValuesBuffer(buffer); + values_buffer.resize(values_index + 1); + values_buffer.back() = values; + int64 size = static_cast(offset + values.size()); + Tensor tensor(dtype, {size}); + + Status status = FillValuesTensor(buffer, 
tensor, dtype, values_index, offset); + ASSERT_TRUE(status.ok()); + AssertTensorRangeEqual(tensor, values, offset); +} + +TEST(FillValuesTensorTest, DT_INT32) { + std::vector values = {3, 2, 1, -1}; + FillValuesTensorTest(values, 0, 0); +} + +TEST(FillValuesTensorTest, DT_INT64) { + std::vector values = {-1, -2}; + FillValuesTensorTest(values, 1, 0); +} + +TEST(FillValuesTensorTest, DT_FLOAT) { + std::vector values = {0.0, 1.0, -1.0}; + FillValuesTensorTest(values, 0, 2); +} + +TEST(FillValuesTensorTest, DT_DOUBLE) { + std::vector values = {3.17, 4.02, 5.13}; + FillValuesTensorTest(values, 11, 11); +} + +TEST(FillValuesTensorTest, DT_STRING) { + std::vector values = {"ABC"}; + FillValuesTensorTest(values, 7, 0); +} + +TEST(FillValuesTensorTest, DT_BOOL) { + std::vector values = {false, true, true}; + FillValuesTensorTest(values, 0, 5); +} + +} // namespace sparse +} // namespace atds +} // namespace tensorflow diff --git a/tensorflow_io/core/kernels/avro/atds/varlen_feature_decoder.h b/tensorflow_io/core/kernels/avro/atds/varlen_feature_decoder.h new file mode 100644 index 000000000..6ad179a55 --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/varlen_feature_decoder.h @@ -0,0 +1,337 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_RAGGED_FEATURE_DECODER_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_RAGGED_FEATURE_DECODER_H_ + +#include "api/Decoder.hh" +#include "api/Node.hh" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow_io/core/kernels/avro/atds/avro_decoder_template.h" +#include "tensorflow_io/core/kernels/avro/atds/decoder_base.h" +#include "tensorflow_io/core/kernels/avro/atds/errors.h" +#include "tensorflow_io/core/kernels/avro/atds/sparse_value_buffer.h" + +namespace tensorflow { +namespace atds { + +namespace varlen { + +struct Metadata { + Metadata(FeatureType type, const string& name, DataType dtype, + const PartialTensorShape& shape, size_t indices_index, + size_t values_index) + : type(type), + name(name), + dtype(dtype), + shape(shape), + indices_index(indices_index), + values_index(values_index) {} + + FeatureType type; + string name; + DataType dtype; + PartialTensorShape shape; + + size_t indices_index; + size_t values_index; +}; + +inline void FillIndicesBuffer(std::vector& indices_buf, + std::vector& current_indice) { + for (const auto& indice_dim : current_indice) { + indices_buf.emplace_back(indice_dim); + } +} + +template +inline Status DecodeVarlenArray(avro::DecoderPtr& decoder, + std::vector& indices_buf, + std::vector& values_buf, + std::vector& current_indice, int rank, + const PartialTensorShape& shape) { + if (rank == 0) { + FillIndicesBuffer(indices_buf, current_indice); + values_buf.emplace_back(avro::decoder_t::Decode(decoder)); + return OkStatus(); + } + + current_indice.emplace_back(0); + int dim = shape.dims() - rank; + int64 size = shape.dim_size(dim); + int64 number = 0; + if (size > 0) { + // slow path with dimension check. 
+ if (rank == 1) { + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + number += static_cast(m); + if (TF_PREDICT_FALSE(number > size)) { + return ShapeError(number, dim, shape); + } + for (size_t i = 0; i < m; i++) { + FillIndicesBuffer(indices_buf, current_indice); + values_buf.emplace_back(avro::decoder_t::Decode(decoder)); + current_indice.back()++; + } + } + } else { + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + number += static_cast(m); + if (TF_PREDICT_FALSE(number > size)) { + return ShapeError(number, dim, shape); + } + for (size_t i = 0; i < m; i++) { + TF_RETURN_IF_ERROR(DecodeVarlenArray(decoder, indices_buf, + values_buf, current_indice, + rank - 1, shape)); + current_indice.back()++; + } + } + } + if (TF_PREDICT_FALSE(number != size)) { + return ShapeError(number, dim, shape); + } + } else { + // fast path without dimension check as the dimension can have unlimited + // values. + if (rank == 1) { + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + for (size_t i = 0; i < m; i++) { + FillIndicesBuffer(indices_buf, current_indice); + values_buf.emplace_back(avro::decoder_t::Decode(decoder)); + current_indice.back()++; + } + } + } else { + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + for (size_t i = 0; i < m; i++) { + TF_RETURN_IF_ERROR(DecodeVarlenArray(decoder, indices_buf, + values_buf, current_indice, + rank - 1, shape)); + current_indice.back()++; + } + } + } + } + + current_indice.pop_back(); + return OkStatus(); +} + +// This template specification handles both byte and string. +// It assumes that avro decodeBytes and decodeString are both reading bytes into +// uint8 arrays see: +// https://github.com/apache/avro/blob/branch-1.9/lang/c%2B%2B/impl/BinaryDecoder.cc#L133 +// As long as that as that assumption holds a separate bytes implementation is +// not required. 
+template <> +inline Status DecodeVarlenArray(avro::DecoderPtr& decoder, + std::vector& indices_buf, + std::vector& values_buf, + std::vector& current_indice, int rank, + const PartialTensorShape& shape) { + if (rank == 0) { + FillIndicesBuffer(indices_buf, current_indice); + values_buf.push_back(""); + decoder->decodeString(values_buf.back()); + return OkStatus(); + } + + current_indice.emplace_back(0); + int dim = shape.dims() - rank; + int64 size = shape.dim_size(dim); + int64 number = 0; + if (size > 0) { + // slow path with dimension check. + if (rank == 1) { + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + number += static_cast(m); + if (TF_PREDICT_FALSE(number > size)) { + return ShapeError(number, dim, shape); + } + for (size_t i = 0; i < m; i++) { + FillIndicesBuffer(indices_buf, current_indice); + values_buf.push_back(""); + decoder->decodeString(values_buf.back()); + current_indice.back()++; + } + } + } else { + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + number += static_cast(m); + if (TF_PREDICT_FALSE(number > size)) { + return ShapeError(number, dim, shape); + } + for (size_t i = 0; i < m; i++) { + TF_RETURN_IF_ERROR(DecodeVarlenArray(decoder, indices_buf, values_buf, + current_indice, rank - 1, + shape)); + current_indice.back()++; + } + } + } + if (TF_PREDICT_FALSE(number != size)) { + return ShapeError(number, dim, shape); + } + } else { + // fast path without dimension check as the dimension can have unlimited + // values. 
+ if (rank == 1) { + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + for (size_t i = 0; i < m; i++) { + FillIndicesBuffer(indices_buf, current_indice); + values_buf.push_back(""); + decoder->decodeString(values_buf.back()); + current_indice.back()++; + } + } + } else { + for (size_t m = decoder->arrayStart(); m != 0; m = decoder->arrayNext()) { + for (size_t i = 0; i < m; i++) { + TF_RETURN_IF_ERROR(DecodeVarlenArray(decoder, indices_buf, values_buf, + current_indice, rank - 1, + shape)); + current_indice.back()++; + } + } + } + } + + current_indice.pop_back(); + return OkStatus(); +} + +template +class FeatureDecoder : public DecoderBase { + public: + explicit FeatureDecoder(const Metadata& metadata) + : metadata_(metadata), rank_(metadata.shape.dims()) {} + + Status operator()(avro::DecoderPtr& decoder, + std::vector& dense_tensors, + sparse::ValueBuffer& buffer, + std::vector& skipped_data, + size_t offset) { + // declaring std::vector locally to make it thread safe + std::vector current_indices; + current_indices.reserve(rank_ + 1); // additional batch dim. 
+ current_indices.resize(1); + current_indices[0] = offset; + size_t indices_index = metadata_.indices_index; + + auto& indices_buf = buffer.indices[indices_index]; + auto& values_buf = + sparse::GetValueVector(buffer, metadata_.values_index); + size_t values_buf_size = values_buf.size(); + TF_RETURN_IF_ERROR(DecodeVarlenArray(decoder, indices_buf, values_buf, + current_indices, rank_, + metadata_.shape)); + size_t total_num_elements = values_buf.size() - values_buf_size; + auto& num_of_elements = buffer.num_of_elements[indices_index]; + if (!num_of_elements.empty()) { + total_num_elements += num_of_elements.back(); + } + num_of_elements.push_back(total_num_elements); + return OkStatus(); + } + + private: + const Metadata& metadata_; + const int rank_; +}; + +} // namespace varlen + +template <> +inline std::unique_ptr CreateFeatureDecoder( + const avro::NodePtr& node, const varlen::Metadata& metadata) { + switch (metadata.dtype) { + case DT_INT32: { + return std::move(std::make_unique>(metadata)); + } + case DT_INT64: { + return std::move( + std::make_unique>(metadata)); + } + case DT_FLOAT: { + return std::move( + std::make_unique>(metadata)); + } + case DT_DOUBLE: { + return std::move( + std::make_unique>(metadata)); + } + case DT_STRING: { + return std::move( + std::make_unique>(metadata)); + } + case DT_BOOL: { + return std::move( + std::make_unique>(metadata)); + } + default: { + TypeNotSupportedAbort(metadata.dtype); + } + } + return nullptr; +} + +template <> +inline Status ValidateSchema(const avro::NodePtr& node, + const varlen::Metadata& metadata) { + avro::NodePtr n = node; + size_t avro_rank = 0; + // Check schema consists of non-nullable nested arrays. 
+ while (n->leaves() != 0) { + if (n->leaves() != 1 || n->type() != avro::AVRO_ARRAY) { + std::ostringstream oss; + n->printJson(oss, 0); + return InvalidVarlenFeatureSchema(metadata.name, oss.str()); + } + n = n->leafAt(0); + avro_rank++; + } + avro::Type avro_type = n->type(); + std::map::const_iterator tf_type = + avro_to_tf_datatype.find(avro_type); + if (tf_type == avro_to_tf_datatype.end()) { + // Check schema data type is supported. + std::ostringstream oss; + node->printJson(oss, 0); + return UnsupportedValueTypeError(metadata.name, oss.str()); + } else if (tf_type->second != metadata.dtype) { + // Check schema data type and metadata type match. + std::ostringstream oss; + node->printJson(oss, 0); + return SchemaValueTypeMismatch(metadata.name, avro_type, metadata.dtype, + oss.str()); + } + // Check schema rank and metadata rank match. + size_t metadata_rank = static_cast(metadata.shape.dims()); + if (avro_rank != metadata_rank) { + std::ostringstream oss; + node->printJson(oss, 0); + return FeatureRankMismatch(metadata.name, avro_rank, metadata_rank, + oss.str()); + } + return OkStatus(); +} + +} // namespace atds +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_RAGGED_FEATURE_DECODER_H_ diff --git a/tensorflow_io/core/kernels/avro/atds/varlen_feature_decoder_test.cc b/tensorflow_io/core/kernels/avro/atds/varlen_feature_decoder_test.cc new file mode 100644 index 000000000..f1bf9464e --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds/varlen_feature_decoder_test.cc @@ -0,0 +1,333 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow_io/core/kernels/avro/atds/varlen_feature_decoder.h" + +#include "api/Decoder.hh" +#include "api/Stream.hh" +#include "api/ValidSchema.hh" +#include "tensorflow/core/platform/test.h" +#include "tensorflow_io/core/kernels/avro/atds/decoder_test_util.h" + +namespace tensorflow { +namespace atds { +namespace varlen { + +template +void VarlenDecoderTest(const T& values, DataType dtype, + std::initializer_list shape, + const std::vector& expected_indices, + const std::vector& expected_values, long offset, + const avro::Type avro_type = avro::AVRO_NULL) { + string feature_name = "feature"; + ATDSSchemaBuilder schema_builder = ATDSSchemaBuilder(); + schema_builder.AddDenseFeature(feature_name, dtype, shape.size(), avro_type); + + string schema = schema_builder.Build(); + avro::ValidSchema writer_schema = schema_builder.BuildVaildSchema(); + avro::GenericDatum atds_datum(writer_schema); + AddDenseValue(atds_datum, feature_name, values); + + avro::OutputStreamPtr out_stream = EncodeAvroGenericDatum(atds_datum); + avro::InputStreamPtr in_stream = avro::memoryInputStream(*out_stream); + avro::DecoderPtr decoder = avro::binaryDecoder(); + decoder->init(*in_stream); + + std::vector dense_features; + std::vector sparse_features; + std::vector varlen_features; + size_t indices_index = 0, values_index = 0; + PartialTensorShape tensor_shape(shape); + varlen_features.emplace_back(FeatureType::varlen, feature_name, dtype, + tensor_shape, indices_index, values_index); + + ATDSDecoder 
atds_decoder = + ATDSDecoder(dense_features, sparse_features, varlen_features); + Status init_status = atds_decoder.Initialize(writer_schema); + ASSERT_TRUE(init_status.ok()); + + std::vector skipped_data = atds_decoder.GetSkippedData(); + std::vector dense_tensors; + sparse::ValueBuffer buffer; + sparse::GetValuesBuffer(buffer).resize(1); + buffer.indices.resize(1); + buffer.num_of_elements.resize(1); + Status decode_status = + atds_decoder.DecodeATDSDatum(decoder, dense_tensors, buffer, skipped_data, + static_cast(offset)); + ASSERT_TRUE(decode_status.ok()); + + std::vector expected_num_elements = {expected_values.size()}; + + ValidateBuffer(buffer, varlen_features[0], expected_indices, expected_values, + expected_num_elements); +} + +template +inline void VarlenDecoderTest(const T& values, DataType dtype, + std::initializer_list shape, + const std::vector& expected_indices, + const std::vector& expected_values, + long offset, const avro::Type avro_type) { + string feature_name = "feature"; + ATDSSchemaBuilder schema_builder = ATDSSchemaBuilder(); + schema_builder.AddDenseFeature(feature_name, dtype, shape.size(), avro_type); + + string schema = schema_builder.Build(); + avro::ValidSchema writer_schema = schema_builder.BuildVaildSchema(); + avro::GenericDatum atds_datum(writer_schema); + AddDenseValue(atds_datum, feature_name, values); + + avro::OutputStreamPtr out_stream = EncodeAvroGenericDatum(atds_datum); + avro::InputStreamPtr in_stream = avro::memoryInputStream(*out_stream); + avro::DecoderPtr decoder = avro::binaryDecoder(); + decoder->init(*in_stream); + + std::vector dense_features; + std::vector sparse_features; + std::vector varlen_features; + size_t indices_index = 0, values_index = 0; + PartialTensorShape tensor_shape(shape); + varlen_features.emplace_back(FeatureType::varlen, feature_name, dtype, + tensor_shape, indices_index, values_index); + + ATDSDecoder atds_decoder = + ATDSDecoder(dense_features, sparse_features, varlen_features); + Status 
init_status = atds_decoder.Initialize(writer_schema); + ASSERT_TRUE(init_status.ok()); + + std::vector skipped_data = atds_decoder.GetSkippedData(); + std::vector dense_tensors; + sparse::ValueBuffer buffer; + sparse::GetValuesBuffer(buffer).resize(1); + buffer.indices.resize(1); + buffer.num_of_elements.resize(1); + Status decode_status = + atds_decoder.DecodeATDSDatum(decoder, dense_tensors, buffer, skipped_data, + static_cast(offset)); + ASSERT_TRUE(decode_status.ok()); + + std::vector expected_num_elements = {expected_values.size()}; + + ValidateBuffer(buffer, varlen_features[0], expected_indices, expected_values, + expected_num_elements); +} + +TEST(VarlenDecoderTest, DT_INT32_scalar) { + int value = -7; + long offset = 1; + std::vector expected_indices = {offset}; + std::vector expected_values = {value}; + + VarlenDecoderTest(value, DT_INT32, {}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_INT32_1D) { + std::vector values = {1, 2, 3}; + long offset = 9; + std::vector expected_indices = {offset, 0, offset, 1, offset, 2}; + std::vector expected_values = values; + + VarlenDecoderTest(values, DT_INT32, {-1}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_INT32_2D) { + std::vector> values = {{-1}, {4, 5, 6}, {-7, 8}}; + long offset = 16; + std::vector expected_indices = {offset, 0, 0, offset, 1, 0, + offset, 1, 1, offset, 1, 2, + offset, 2, 0, offset, 2, 1}; + std::vector expected_values = {-1, 4, 5, 6, -7, 8}; + + VarlenDecoderTest(values, DT_INT32, {3, -1}, expected_indices, + expected_values, offset); +} + +TEST(VarlenDecoderTest, DT_INT64_scalar) { + long value = 1; + long offset = 0; + std::vector expected_indices = {offset}; + std::vector expected_values = {value}; + VarlenDecoderTest(value, DT_INT64, {}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_INT64_1D) { + std::vector values = {1}; + long offset = 3; + std::vector expected_indices = {offset, 0}; + 
std::vector expected_values = values; + VarlenDecoderTest(values, DT_INT64, {-1}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_INT64_2D) { + std::vector> values = {{1}}; + long offset = 3; + std::vector expected_indices = {offset, 0, 0}; + std::vector expected_values = {1}; + VarlenDecoderTest(values, DT_INT64, {-1, -1}, expected_indices, + expected_values, offset); +} + +TEST(VarlenDecoderTest, DT_FLOAT_scalar) { + float value = -0.6; + long offset = 5; + std::vector expected_indices = {offset}; + std::vector expected_values = {value}; + VarlenDecoderTest(value, DT_FLOAT, {}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_FLOAT_1D) { + std::vector values = {}; + long offset = 111; + std::vector expected_indices = {}; + std::vector expected_values = values; + VarlenDecoderTest(values, DT_FLOAT, {-1}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_FLOAT_2D) { + std::vector> values = {{-0.1, -0.2, -0.3}, {-1.4, 5.4}}; + long offset = 111; + std::vector expected_indices = { + offset, 0, 0, offset, 0, 1, offset, 0, 2, offset, 1, 0, offset, 1, 1}; + std::vector expected_values = {-0.1, -0.2, -0.3, -1.4, 5.4}; + VarlenDecoderTest(values, DT_FLOAT, {-1, -1}, expected_indices, + expected_values, offset); +} + +TEST(VarlenDecoderTest, DT_DOUBLE_scalar) { + double value = -0.99; + long offset = 1; + std::vector expected_indices = {offset}; + std::vector expected_values = {value}; + VarlenDecoderTest(value, DT_DOUBLE, {}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_DOUBLE_1D) { + std::vector values = {1.852, 0.79}; + long offset = 3; + std::vector expected_indices = {offset, 0, offset, 1}; + std::vector expected_values = values; + VarlenDecoderTest(values, DT_DOUBLE, {-1}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_DOUBLE_2D) { + std::vector> values = {}; + long offset = 5; + std::vector expected_indices = {}; 
+ std::vector expected_values = {}; + VarlenDecoderTest(values, DT_DOUBLE, {-1, 2}, expected_indices, + expected_values, offset); +} + +TEST(VarlenDecoderTest, DT_STRING_scalar) { + string value = "abc"; + long offset = 7; + std::vector expected_indices = {offset}; + std::vector expected_values = {"abc"}; + VarlenDecoderTest(value, DT_STRING, {}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_BYTES_scalar) { + byte_array value{0xb4, 0x98, 0x1a}; + long offset = 7; + std::vector expected_indices = {offset}; + std::vector expected_values = {value}; + VarlenDecoderTest(value, DT_STRING, {}, expected_indices, expected_values, + offset, avro::AVRO_BYTES); +} + +TEST(VarlenDecoderTest, DT_STRING_1D) { + std::vector values = {"", "", ""}; + long offset = 0; + std::vector expected_indices = {offset, 0, offset, 1, offset, 2}; + std::vector expected_values = values; + VarlenDecoderTest(values, DT_STRING, {-1}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_BYTES_1D) { + byte_array v1{0xb4, 0xaf, 0x98, 0x1a}; + byte_array v2{0xb4, 0xaf, 0x98}; + byte_array v3{0xb4, 0x98, 0x1a}; + std::vector values = {v1, v2, v3}; + long offset = 0; + std::vector expected_indices = {offset, 0, offset, 1, offset, 2}; + std::vector expected_values = values; + VarlenDecoderTest(values, DT_STRING, {-1}, expected_indices, expected_values, + offset, avro::AVRO_BYTES); +} + +TEST(VarlenDecoderTest, DT_STRING_2D) { + std::vector> values = {{"abc"}, {"ABC"}, {"LINKEDIN"}}; + long offset = 0; + std::vector expected_indices = {offset, 0, 0, offset, 1, + 0, offset, 2, 0}; + std::vector expected_values = {"abc", "ABC", "LINKEDIN"}; + VarlenDecoderTest(values, DT_STRING, {-1, 1}, expected_indices, + expected_values, offset); +} + +TEST(VarlenDecoderTest, DT_BYTES_2D) { + byte_array v1{0xb4, 0xaf, 0x98, 0x1a}; + byte_array v2{0xb4, 0xaf, 0x98}; + byte_array v3{0xb4, 0x98, 0x1a}; + std::vector> values = {{v1}, {v2}, {v3}}; + long offset = 0; + 
std::vector expected_indices = {offset, 0, 0, offset, 1, + 0, offset, 2, 0}; + std::vector expected_values = {v1, v2, v3}; + VarlenDecoderTest(values, DT_STRING, {-1, 1}, expected_indices, + expected_values, offset, avro::AVRO_BYTES); +} + +TEST(VarlenDecoderTest, DT_BOOL_scalar) { + bool value = true; + long offset = 0; + std::vector expected_indices = {offset}; + std::vector expected_values = {value}; + VarlenDecoderTest(value, DT_BOOL, {}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_BOOL_1D) { + std::vector values = {true, false, true}; + long offset = 3; + std::vector expected_indices = {offset, 0, offset, 1, offset, 2}; + std::vector expected_values = values; + VarlenDecoderTest(values, DT_BOOL, {-1}, expected_indices, expected_values, + offset); +} + +TEST(VarlenDecoderTest, DT_BOOL_2D) { + std::vector> values = {{}, {true, true}}; + long offset = 4; + std::vector expected_indices = {offset, 1, 0, offset, 1, 1}; + std::vector expected_values = {true, true}; + VarlenDecoderTest(values, DT_BOOL, {2, -1}, expected_indices, expected_values, + offset); +} + +} // namespace varlen +} // namespace atds +} // namespace tensorflow diff --git a/tensorflow_io/core/kernels/avro/atds_dataset_kernels.cc b/tensorflow_io/core/kernels/avro/atds_dataset_kernels.cc new file mode 100644 index 000000000..906d7bf9b --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds_dataset_kernels.cc @@ -0,0 +1,1210 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow_io/core/kernels/avro/atds_dataset_kernels.h" + +#include +#include +#include +#include + +#include "api/Compiler.hh" +#include "api/DataFile.hh" +#include "api/Decoder.hh" +#include "api/Specific.hh" +#include "api/Stream.hh" +#include "api/ValidSchema.hh" +#include "tensorflow/core/data/name_utils.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/io/inputbuffer.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow_io/core/kernels/avro/atds/atds_decoder.h" +#include "tensorflow_io/core/kernels/avro/atds/avro_block_reader.h" +#include "tensorflow_io/core/kernels/avro/atds/decompression_handler.h" +#include "tensorflow_io/core/kernels/avro/atds/errors.h" +#include "tensorflow_io/core/kernels/avro/atds/shuffle_handler.h" + +namespace tensorflow { +namespace data { + +void ParallelFor(const std::function& f, size_t n, + thread::ThreadPool* thread_pool) { + if (n == 0) return; + if (thread_pool == nullptr) { + for (size_t i = 0; i < n; ++i) { + f(i); + } + } else { + BlockingCounter counter(n - 1); + for (size_t i = 1; i < n; ++i) { + thread_pool->Schedule([i, &f, &counter] { + f(i); + counter.DecrementCount(); + }); + } + f(0); + counter.Wait(); + } +} + +/* static */ constexpr const char* const ATDSDatasetOp::kDatasetType; +/* static */ constexpr const char* const 
ATDSDatasetOp::kFileNames; +/* static */ constexpr const char* const ATDSDatasetOp::kBatchSize; +/* static */ constexpr const char* const ATDSDatasetOp::kDropRemainder; +/* static */ constexpr const char* const ATDSDatasetOp::kReaderBufferSize; +/* static */ constexpr const char* const ATDSDatasetOp::kShuffleBufferSize; +/* static */ constexpr const char* const ATDSDatasetOp::kNumParallelCalls; +/* static */ constexpr const char* const ATDSDatasetOp::kFeatureKeys; +/* static */ constexpr const char* const ATDSDatasetOp::kFeatureTypes; +/* static */ constexpr const char* const ATDSDatasetOp::kSparseDtypes; +/* static */ constexpr const char* const ATDSDatasetOp::kSparseShapes; +/* static */ constexpr const char* const ATDSDatasetOp::kOutputDtypes; +/* static */ constexpr const char* const ATDSDatasetOp::kOutputShapes; +/* static */ constexpr const char* const ATDSDatasetOp::kDenseType; +/* static */ constexpr const char* const ATDSDatasetOp::kSparseType; +/* static */ constexpr const char* const ATDSDatasetOp::kVarlenType; + +class ATDSDatasetOp::Dataset : public DatasetBase { + public: + explicit Dataset(OpKernelContext* ctx, std::vector filenames, + size_t batch_size, bool drop_remainder, + int64 reader_buffer_size, int64 shuffle_buffer_size, + int64 num_parallel_calls, + const std::vector& feature_keys, + const std::vector& feature_types, + const std::vector& sparse_dtypes, + const std::vector& sparse_shapes, + const std::vector& output_dtypes, + const std::vector& output_shapes) + : DatasetBase(DatasetContext(ctx)), + filenames_(std::move(filenames)), + batch_size_(batch_size), + reader_buffer_size_(reader_buffer_size), + shuffle_buffer_size_(shuffle_buffer_size), + num_parallel_calls_(num_parallel_calls), + drop_remainder_(drop_remainder), + feature_keys_(feature_keys), + feature_types_(feature_types), + sparse_dtypes_(sparse_dtypes), + sparse_shapes_(sparse_shapes), + output_dtypes_(output_dtypes), + output_shapes_(output_shapes) { + size_t num_of_features = 
feature_keys_.size(); + output_tensor_types_.reserve(num_of_features); + sparse_value_index_.reserve(sparse_dtypes.size()); + for (size_t i = 0; i < num_of_features; i++) { + if (feature_types[i] == kDenseType) { + output_tensor_types_.emplace_back(TensorType::dense); + auto dim_v = output_shapes[i].dim_sizes(); + size_t rank = dim_v.size(); + + TensorShapeProto proto; + PartialTensorShape shape; + for (size_t d = 1; d < rank; d++) { + proto.add_dim()->set_size(dim_v[d]); + } + if (!PartialTensorShape::BuildPartialTensorShape(proto, &shape).ok()) { + LOG(ERROR) << "Error encountered in creating PartialTensorShape for " + "dense features."; + } + dense_features_.emplace_back(atds::FeatureType::dense, feature_keys_[i], + output_dtypes[i], shape, num_of_dense_); + num_of_dense_++; + } else if (feature_types[i] == kSparseType || + feature_types[i] == kVarlenType) { + output_tensor_types_.emplace_back(TensorType::sparse); + + auto& shape = sparse_shapes[num_of_sparse_]; + // The estimated number of elements in this sparse tensor. + // The estimated number is used to preallocate sparse value buffer. + size_t estimated_elements = 1; + if (feature_types[i] == kVarlenType) { + for (auto dim : shape) { + // Assume unknown dim will only have 1 element. For example, + // varlen tensor with shape [-1, 2, -1] is expected to have 2 + // elements in total. 
+ if (dim.size > 0) { + estimated_elements *= dim.size; + } + } + } + size_t rank_after_batch = static_cast(shape.dims() + 1); + sparse_expected_elements_.indices.push_back(rank_after_batch * + estimated_elements); + + size_t values_index = 0; + auto dtype = sparse_dtypes[num_of_sparse_]; + if (dtype == DT_INT32) { + values_index = sparse_dtype_counts_.int_counts++; + sparse_expected_elements_.int_values.push_back(estimated_elements); + } else if (dtype == DT_INT64) { + values_index = sparse_dtype_counts_.long_counts++; + sparse_expected_elements_.long_values.push_back(estimated_elements); + } else if (dtype == DT_FLOAT) { + values_index = sparse_dtype_counts_.float_counts++; + sparse_expected_elements_.float_values.push_back(estimated_elements); + } else if (dtype == DT_DOUBLE) { + values_index = sparse_dtype_counts_.double_counts++; + sparse_expected_elements_.double_values.push_back(estimated_elements); + } else if (dtype == DT_STRING) { + values_index = sparse_dtype_counts_.string_counts++; + sparse_expected_elements_.string_values.push_back(estimated_elements); + } else if (dtype == DT_BOOL) { + values_index = sparse_dtype_counts_.bool_counts++; + sparse_expected_elements_.bool_values.push_back(estimated_elements); + } + sparse_value_index_.emplace_back(values_index); + + if (feature_types[i] == kSparseType) { + sparse_features_.emplace_back( + atds::FeatureType::sparse, feature_keys_[i], + sparse_dtypes[num_of_sparse_], sparse_shapes[num_of_sparse_], + num_of_sparse_, values_index); + } else if (feature_types[i] == kVarlenType) { + varlen_features_.emplace_back( + atds::FeatureType::varlen, feature_keys_[i], + sparse_dtypes[num_of_sparse_], sparse_shapes[num_of_sparse_], + num_of_sparse_, values_index); + } + num_of_sparse_++; + } else { + LOG(ERROR) << "Unknown feature type " << feature_types[i]; + } + } + + for (auto& dtype : output_dtypes) { + output_dtype_vector_.push_back(dtype); + } + } + + std::unique_ptr MakeIteratorInternal( + const string& prefix) 
const override { + return absl::make_unique(Iterator::Params{ + this, name_utils::IteratorPrefix(kDatasetType, prefix)}); + } + + const DataTypeVector& output_dtypes() const override { + return output_dtype_vector_; + } + + const std::vector& output_shapes() const override { + return output_shapes_; + } + + string DebugString() const override { + return name_utils::DatasetDebugString(kDatasetType); + } + + Status InputDatasets(std::vector* inputs) const override { + return OkStatus(); + } + + Status CheckExternalState() const override { return OkStatus(); } + + protected: + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override { + Node* filenames = nullptr; + TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames)); + Node* batch_size = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size)); + Node* drop_remainder = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder)); + Node* reader_buffer_size = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(reader_buffer_size_, &reader_buffer_size)); + Node* shuffle_buffer_size = nullptr; + TF_RETURN_IF_ERROR( + b->AddScalar(shuffle_buffer_size_, &shuffle_buffer_size)); + Node* num_parallel_calls = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(num_parallel_calls_, &num_parallel_calls)); + + AttrValue feature_keys; + b->BuildAttrValue(feature_keys_, &feature_keys); + AttrValue feature_types; + b->BuildAttrValue(feature_types_, &feature_types); + AttrValue sparse_dtypes; + b->BuildAttrValue(sparse_dtypes_, &sparse_dtypes); + AttrValue sparse_shapes; + b->BuildAttrValue(sparse_shapes_, &sparse_shapes); + AttrValue output_dtypes; + b->BuildAttrValue(output_dtypes_, &output_dtypes); + AttrValue output_shapes; + b->BuildAttrValue(output_shapes_, &output_shapes); + + TF_RETURN_IF_ERROR(b->AddDataset( + this, + {filenames, batch_size, drop_remainder, reader_buffer_size, + shuffle_buffer_size, num_parallel_calls}, + {{kFeatureKeys, feature_keys}, 
+ {kFeatureTypes, feature_types}, + {kSparseDtypes, sparse_dtypes}, + {kSparseShapes, sparse_shapes}, + {kOutputDtypes, output_dtypes}, + {kOutputShapes, output_shapes}}, + output)); + return OkStatus(); + } + + private: + enum class TensorType { dense, sparse }; + + /** + * Utility struct to collect the number of sparse tensors for each DType. + */ + struct SparseDtypeCounts { + size_t int_counts = 0; + size_t long_counts = 0; + size_t float_counts = 0; + size_t double_counts = 0; + size_t string_counts = 0; + size_t bool_counts = 0; + }; + + /** + * Utility struct to store the estimated number of elements for each sparse + * tensor. The estimated number in values tensor and indices tensor are + * ordered based on the layout in atds::sparse::ValueBuffer. + * The information is used for better buffer pre-allocation. + */ + struct SparseExpectedElements { + std::vector int_values; + std::vector long_values; + std::vector float_values; + std::vector double_values; + std::vector string_values; + std::vector bool_values; + std::vector indices; + }; + + class Iterator : public DatasetIterator { + public: + static constexpr const char* const kWaitingForData = "WaitingForData"; + static constexpr const char* const kBlockReading = "BlockReading"; + static constexpr const char* const kParsingThread = "ParsingThread_"; + static constexpr const char* const kDeflateDecompression = + "DeflateDecompression"; + static constexpr const char* const kSnappyDecompression = + "SnappyDecompression"; + static constexpr const char* const kFillingSparseValues = + "FillingSparseValues"; + + explicit Iterator(const Params& params) + : DatasetIterator(params), + shuffle_handler_(nullptr), + cond_var_(std::make_shared()), + write_var_(std::make_shared()), + mu_(std::make_shared()), + count_(0) { + batch_size_ = static_cast(dataset()->batch_size_); + shuffle_buffer_size_ = + static_cast(dataset()->shuffle_buffer_size_); + shuffle_handler_ = std::make_unique(mu_.get()); + decompression_handler_ 
= std::make_unique(); + auto& sparse_dtype_counts = dataset()->sparse_dtype_counts_; + value_buffer_.int_values.resize(sparse_dtype_counts.int_counts); + value_buffer_.long_values.resize(sparse_dtype_counts.long_counts); + value_buffer_.float_values.resize(sparse_dtype_counts.float_counts); + value_buffer_.double_values.resize(sparse_dtype_counts.double_counts); + value_buffer_.string_values.resize(sparse_dtype_counts.string_counts); + value_buffer_.bool_values.resize(sparse_dtype_counts.bool_counts); + value_buffer_.num_of_elements.resize(dataset()->num_of_sparse_); + value_buffer_.indices.resize(dataset()->num_of_sparse_); + } + + ~Iterator() override { + // must ensure that the thread is cancelled. + CancelThreads(); + // LOG(INFO) << "Decompression time per record (us): " << + // (static_cast(GetTotalStats(total_decompress_micros_)) / + // GetTotalStats(num_decompressed_objects_)); LOG(INFO) << "Decode time + // per record (us): " << + // (static_cast(GetTotalStats(total_decode_micros_)) / + // GetTotalStats(total_records_parsed_)); + } + + void CancelThreads() TF_LOCKS_EXCLUDED(mu_) { + mutex_lock l(*mu_); + mutex_lock i(input_mu_); + cancelled_ = true; + cond_var_->notify_all(); + write_var_->notify_all(); + // wait for thread to finish + if (prefetch_thread_) { + while (!prefetch_thread_finished_) { + write_var_->wait(i); + } + } + } + + Status Initialize(IteratorContext* ctx) { + int64 num_threads = dataset()->num_parallel_calls_; + const int64 max_parallelism = port::MaxParallelism(); + if (num_threads <= 0 || num_threads > max_parallelism) { + if (num_threads == tensorflow::data::model::kAutotune) { + LOG(INFO) << "Thread autotuning enabled for " + "ATDSDatasetOp::Dataset::Iterator."; + } + LOG(INFO) << "Create ATDSDatasetOp::Dataset::Iterator thread pool with " + << "the maximum parallelism number " << max_parallelism + << " for this process."; + num_threads = max_parallelism; + } + thread_delays.resize(max_parallelism, 0); + 
thread_itrs.resize(max_parallelism, 0); + thread_pool_ = + ctx->CreateThreadPool(std::string(kDatasetType), num_threads); + return OkStatus(); + } + + Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) override { + mutex_lock l(*mu_); + EnsurePrefetchThreadStarted(ctx); + size_t total_buffer = total_buffer_size(); + while (true) { + // LOG(INFO) << "b " << blocks_.size() << " c_: " << count_; + // while count_ is smaller than batch_size, wait on cond_var_ if not + // last file this will get woken up by the prefetch thread + size_t count = 0; + bool prefetch_thread_finished = false; + { + tensorflow::profiler::TraceMe trace(kWaitingForData); + + mutex_lock i(input_mu_); + while (!cancelled_ && !prefetch_thread_finished_ && + count_ < total_buffer) { + // LOG(INFO) << "waiting on block refill " << blocks_.size() << " + // count: " << count_; + write_var_->notify_all(); + cond_var_->wait(i); + } + // LOG(INFO) << "done waiting on block refill " << blocks_.size() << " + // count: " << count_; + if (cancelled_) { + return OkStatus(); + } + + count_ = 0; + // merge write_blocks_ into blocks_ + blocks_.reserve(blocks_.size() + write_blocks_.size()); + blocks_.insert(blocks_.end(), + std::make_move_iterator(write_blocks_.begin()), + std::make_move_iterator(write_blocks_.end())); + write_blocks_.clear(); // size down the write_blocks + + size_t non_empty_idx = 0; + for (size_t i = 0; i < blocks_.size(); i++) { + count_ += blocks_[i]->object_count - blocks_[i]->num_decoded; + if (blocks_[i]->num_decoded < blocks_[i]->object_count) { + std::swap(blocks_[non_empty_idx], blocks_[i]); + non_empty_idx++; + } + } + blocks_.resize(non_empty_idx); + + count = count_; + prefetch_thread_finished = prefetch_thread_finished_; + + // let it continue to read batch_size_ or count_ records. 
+ count_ -= std::min(count_, batch_size_); + write_var_->notify_all(); + + if (prefetch_thread_finished_) { + // Finished epoch, reset shuffle for new epoch + shuffle_handler_->ResetRngs(); + } + } + + bool drop_remainder = dataset()->drop_remainder_; + if (count >= batch_size_ || + (!drop_remainder && prefetch_thread_finished && count > 0)) { + // LOG(INFO) << "Process " << blocks_.size() << " blocks with " << + // count << " objects. " << non_empty_idx << " batch: " << + // batch_size_; + size_t batch_size = std::min(count, batch_size_); + PartialTensorShape batch_dim({static_cast(batch_size)}); + auto num_of_dense = dataset()->num_of_dense_; + auto num_of_sparse = dataset()->num_of_sparse_; + auto& dense_features = dataset()->dense_features_; + std::vector dense_tensors; + for (size_t i = 0; i < num_of_dense; i++) { + auto& dense_feature = dense_features[i]; + TensorShape shape; + batch_dim.Concatenate(dense_feature.shape).AsTensorShape(&shape); + dense_tensors.emplace_back(ctx->allocator({}), dense_feature.dtype, + shape); + } + + size_t thread_pool_size = + static_cast(thread_pool_->NumThreads()); + size_t num_blocks = blocks_.size(); + size_t num_threads = std::min(num_blocks, thread_pool_size); + num_threads = std::min(num_threads, + static_cast(port::MaxParallelism())); + + int64 user_defined_thread_num = dataset()->num_parallel_calls_; + if (user_defined_thread_num > 0) { + num_threads = std::min( + num_threads, static_cast(user_defined_thread_num)); + } else if (user_defined_thread_num == + tensorflow::data::model::kAutotune) { + num_threads = ComputeNumAutotuneThreads(num_threads); + } + total_records_parsed_.resize(num_threads, 0); + total_decode_micros_.resize(num_threads, 0); + num_decompressed_objects_.resize(num_threads, 0); + total_decompress_micros_.resize(num_threads, 0); + shuffle_handler_->SampleBlocks(batch_size, shuffle_buffer_size_ > 0, + blocks_); + std::vector sparse_buffer(num_threads, + value_buffer_); + + std::vector 
status_of_threads(num_threads); + auto process_block = [&](size_t i, size_t thread_idx, + avro::DecoderPtr& decoder, + atds::sparse::ValueBuffer& buffer, + std::vector& skipped) { + // start is the offset in the each example, and therefore just need + // to be different from every other block. + size_t start = 0; + if (i > 0) { + start += blocks_[i - 1]->counts; + } + size_t end = blocks_[i]->counts; + // LOG(INFO) << "Block: " << i << " start: " << start << " end: " << + // end << " read_so_far " << blocks_[i]->num_decoded + // << " num_to_decode: " << blocks_[i]->num_to_decode << " + // remaining: " << (blocks_[i]->object_count - + // blocks_[i]->num_decoded); + avro::Codec codec = blocks_[i]->codec; + avro::InputStreamPtr input_stream = nullptr; + uint64 decompress_start_time = ctx->env()->NowMicros(); + if (codec == avro::NULL_CODEC) { + input_stream = + decompression_handler_->decompressNullCodec(*(blocks_[i])); + } else if (codec == avro::DEFLATE_CODEC) { + tensorflow::profiler::TraceMe traceme(kDeflateDecompression); + input_stream = + decompression_handler_->decompressDeflateCodec(*(blocks_[i])); + } +#ifdef SNAPPY_CODEC_AVAILABLE + else if (codec == avro::SNAPPY_CODEC) { + tensorflow::profiler::TraceMe traceme(kSnappyDecompression); + input_stream = + decompression_handler_->decompressSnappyCodec(*(blocks_[i])); + } +#endif + else { + throw avro::Exception( + "Unsupported Avro codec. Only null or deflate is supported. 
" + "Got " + + codec); + } + uint64 decompress_end_time = ctx->env()->NowMicros(); + if (codec != avro::NULL_CODEC) { + total_decompress_micros_[thread_idx] += + (decompress_end_time - decompress_start_time); + num_decompressed_objects_[thread_idx] += blocks_[i]->object_count; + // LOG(INFO) << "Block " << i << " decompress time (us): " << + // (decompress_end_time - decompress_start_time) + // << ", num records: " << blocks_[i]->object_count; + } + decoder->init(*input_stream); + + while (start < end) { + // LOG(INFO) << "Block: " << i << " start: " << start; + uint64 datum_parse_start = ctx->env()->NowMicros(); + auto decoding_status = atds_decoder_->DecodeATDSDatum( + decoder, dense_tensors, buffer, skipped, start); + if (!decoding_status.ok()) { + // The decoding of this block has failed, + // setting the number of decoded objects to the total number of + // objects in the block so the decoder will skip decoding this + // block. + blocks_[i]->num_decoded = blocks_[i]->object_count; + return decoding_status; + } + uint64 datum_parse_end = ctx->env()->NowMicros(); + total_decode_micros_[thread_idx] += + (datum_parse_end - datum_parse_start); + total_records_parsed_[thread_idx] += 1; + start++; + blocks_[i]->num_decoded++; + blocks_[i]->num_to_decode--; + } + + if (blocks_[i]->object_count > blocks_[i]->num_decoded) { + decoder->init(*input_stream); + blocks_[i]->read_offset += input_stream->byteCount(); + // LOG(INFO) << "Block: " << i << " Reset offset to " << + // blocks_[i]->read_offset << ". " << (end - start) + // << " datum left for block " << i; + } + // LOG(INFO) << "process block " << i << " . 
Read: " << + // blocks_[i]->num_decoded; + return OkStatus(); + }; + + std::vector block_nums; + GetBlockRanges(num_threads, block_nums); + std::vector thread_start_times; + thread_start_times.resize(num_threads, 0); + auto process = [&](size_t index) { + auto parsing_thread_name = [index]() { + return strings::StrCat(kParsingThread, index); + }; + tensorflow::profiler::TraceMe trace(parsing_thread_name); + + thread_start_times[index] = ctx->env()->NowMicros(); + size_t block_start = 0; + if (index > 0) { + block_start = block_nums[index - 1]; + } + size_t block_end = block_nums[index]; + auto decoder = avro::binaryDecoder(); + auto skipped = atds_decoder_->GetSkippedData(); + auto& buffer = sparse_buffer[index]; + size_t count_start = 0; + if (block_start > 0) { + count_start = blocks_[block_start - 1]->counts; + } + size_t num_of_datum = blocks_[block_end - 1]->counts - count_start; + InitSparseValueBuffer(buffer, num_of_datum); + // LOG(INFO) << "Thread " << index << " process blocks from " << + // block_start << " to " + // << block_end << " with " << num_of_datum << " + // examples."; + + status_of_threads[index] = OkStatus(); + auto& status = status_of_threads[index]; + + for (size_t i = block_start; i < block_end && status.ok(); i++) { + if (blocks_[i]->codec != avro::NULL_CODEC || + blocks_[i]->num_to_decode > 0) { + status = process_block(i, index, decoder, buffer, skipped); + } + } + // LOG(INFO) << "Thread " << index << " process blocks from " << + // block_start << " to " << block_end << ". 
Done."; + }; + ParallelFor(process, num_threads, thread_pool_.get()); + uint64 earliest_start_time = *std::min_element( + thread_start_times.begin(), thread_start_times.end()); + for (size_t i = 0; i < num_threads; i++) { + thread_delays[i] += (thread_start_times[i] - earliest_start_time); + thread_itrs[i] += 1; + } + for (Status& status : status_of_threads) { + TF_RETURN_IF_ERROR(status); + } + + std::vector num_of_elements(num_of_sparse, 0); + std::vector indices_tensors; + std::vector values_tensors; + std::vector shape_tensors; + indices_tensors.reserve(num_of_sparse); + values_tensors.reserve(num_of_sparse); + shape_tensors.reserve(num_of_sparse); + auto& sparse_dtypes = dataset()->sparse_dtypes_; + auto& sparse_shapes = dataset()->sparse_shapes_; + for (size_t i = 0; i < num_of_sparse; i++) { + for (size_t t = 0; t < num_threads; t++) { + // Check if vector is empty and move on to the next vector. + // If shuffle buffer and number of threads is large compared + // to the batch, this vector maybe empty for certain threads. + num_of_elements[i] += static_cast( + GetLastElement(sparse_buffer[t].num_of_elements[i])); + } + auto& sparse_shape = sparse_shapes[i]; + + int64 rank = sparse_shape.dims() + 1; + TensorShape indices_shape({num_of_elements[i], rank}); + TensorShape values_shape({num_of_elements[i]}); + TensorShape shape_shape({rank}); + indices_tensors.emplace_back(DT_INT64, indices_shape); + values_tensors.emplace_back(sparse_dtypes[i], values_shape); + shape_tensors.emplace_back(DT_INT64, shape_shape); + + auto& shape_tensor = shape_tensors.back(); + size_t d = 0; + shape_tensor.vec()(d++) = batch_size; + for (auto dim : sparse_shape) { + if (dim.size > 0) { + shape_tensor.vec()(d++) = dim.size; + } else { + // When dim size is unknown i.e. -1, scan indices array to find + // the largest dim value. 
+ long max_dim = -1; + for (size_t t = 0; t < num_threads; t++) { + auto& indices = sparse_buffer[t].indices[i]; + for (size_t pos = d; pos < indices.size(); pos += rank) { + max_dim = std::max(max_dim, indices[pos]); + } + } + shape_tensor.vec()(d++) = max_dim + 1; + } + } + } + + auto& sparse_value_index = dataset()->sparse_value_index_; + auto fill_sparse_value = [&](int64 thread_index) { + // LOG(INFO) << "Thread " << thread_index << " starts filling sparse + // value"; + auto& buffer = sparse_buffer[thread_index]; + for (size_t i = 0; i < num_of_sparse; i++) { + size_t offset = 0; + int64 index = thread_index; + while (index > 0) { + index--; + offset += + GetLastElement(sparse_buffer[index].num_of_elements[i]); + } + + size_t rank_after_batch = + static_cast(sparse_shapes[i].dims() + 1); + atds::sparse::FillIndicesTensor(buffer.indices[i], + indices_tensors[i], + rank_after_batch * offset); + atds::sparse::FillValuesTensor(buffer, values_tensors[i], + sparse_dtypes[i], + sparse_value_index[i], offset); + // LOG(INFO) << "Thread " << thread_index << " filled sparse + // values."; + } + }; + + { + tensorflow::profiler::TraceMe trace(kFillingSparseValues); + ParallelFor(fill_sparse_value, num_threads, thread_pool_.get()); + } + + size_t feature_num = num_of_dense + num_of_sparse; + size_t dense_index = 0, sparse_index = 0; + auto& feature_types = dataset()->output_tensor_types_; + for (size_t i = 0; i < feature_num; i++) { + if (feature_types[i] == TensorType::dense) { + out_tensors->emplace_back( + std::move(dense_tensors[dense_index++])); + } else if (feature_types[i] == TensorType::sparse) { + out_tensors->emplace_back(DT_VARIANT, TensorShape({3})); + auto& serialized_sparse_t = out_tensors->back(); + serialized_sparse_t.vec()(0) = + std::move(indices_tensors[sparse_index]); + serialized_sparse_t.vec()(1) = + std::move(values_tensors[sparse_index]); + serialized_sparse_t.vec()(2) = + std::move(shape_tensors[sparse_index]); + sparse_index++; + } + } + // 
LOG(INFO) << "Done with batch " ; + *end_of_sequence = false; + return OkStatus(); + } else { + *end_of_sequence = true; + return prefetch_thread_status_; + } + } + } + + protected: + std::shared_ptr CreateNode( + IteratorContext* ctx, model::Node::Args args) const override { + return model::MakeSourceNode(std::move(args)); + } + + Status SaveInternal(SerializationContext* ctx, + IteratorStateWriter* writer) override { + return errors::Unimplemented("SaveInternal"); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + return errors::Unimplemented( + "Iterator does not support 'RestoreInternal')"); + } + + private: + // Returns the last element of the provided integer vector is a null-safe + // fashion + size_t GetLastElement(const std::vector& num_of_elements_at_i) { + if (num_of_elements_at_i.empty()) { + return 0; + } + return num_of_elements_at_i.back(); + } + + void PrefetchThread(const std::shared_ptr& ctx) { + size_t total_buffer = total_buffer_size(); + std::unique_ptr reader; + std::unique_ptr file; + size_t current_file_index = 0; + while (true) { + // 1. wait for a slot in the buffer + { + mutex_lock l(input_mu_); + while (!cancelled_ && count_ >= total_buffer) { + // LOG(INFO) << "prefetch waiting on block size " << blocks_.size() + // << " count: " << count_; + cond_var_->notify_one(); + write_var_->wait(l); + } + // LOG(INFO) << "prefetch done waiting on block size " << + // blocks_.size() << " count: " << count_; + if (cancelled_) { + prefetch_thread_finished_ = true; + prefetch_thread_status_ = OkStatus(); + cond_var_->notify_all(); + write_var_->notify_all(); + return; + } + } // done with mutex_lock l + // 2. 
read the next elements unil count hits max + Status status = OkStatus(); + if (!reader) { + status = + SetupStreamsLocked(ctx->env(), file, reader, current_file_index); + if (!status.ok()) { + mutex_lock l(input_mu_); + LOG(ERROR) << "Error loading file: " + << dataset()->filenames_[current_file_index]; + prefetch_thread_finished_ = true; + prefetch_thread_status_ = status; + cond_var_->notify_all(); + write_var_->notify_all(); + return; + } + } + + // LOG(INFO) << "Before processing " << count_ << " datum left in + // block."; + tensorflow::profiler::TraceMe trace(kBlockReading); + + auto block = std::make_unique(); + status = reader->ReadBlock(*block); + // LOG(INFO) << "Read block status: " << status.ToString(); + // done with mutex_lock input_l + if (!status.ok()) { + if (!errors::IsOutOfRange(status)) { + LOG(ERROR) << "Error in reading avro block. Cause: " + << status.ToString(); + } + // LOG(INFO) << "Resetting stream: " << status.ToString() << "b " << + // blocks_.size() << " c_: " << count_; + ResetStreamsLocked(file, reader); + ++current_file_index; + if (current_file_index >= dataset()->filenames_.size()) { + mutex_lock l(input_mu_); + prefetch_thread_finished_ = true; + // Note: this is overwriting any previous errors + prefetch_thread_status_ = OkStatus(); + cond_var_->notify_all(); + write_var_->notify_all(); + return; + } // done with mutex_lock l + } else { + mutex_lock n(input_mu_); + count_ += block->object_count; + write_blocks_.emplace_back(std::move(block)); + ++num_blocks_read_; + } + } // end while + } + + Status EnsurePrefetchThreadStarted(IteratorContext* ctx) + TF_EXCLUSIVE_LOCKS_REQUIRED(*mu_) { + if (!prefetch_thread_) { + std::shared_ptr new_ctx = + std::make_shared(*ctx); + prefetch_thread_ = + ctx->StartThread("atds_data_prefetch", + [this, new_ctx]() { PrefetchThread(new_ctx); }); + } + return OkStatus(); + } + + size_t total_buffer_size() { return batch_size_ + shuffle_buffer_size_; } + + // Sets up reader streams to read from the 
file at `current_file_index_`. + Status SetupStreamsLocked( + Env* env, std::unique_ptr& file, + std::unique_ptr& reader, size_t current_file_index) { + if (current_file_index >= dataset()->filenames_.size()) { + return errors::InvalidArgument( + "current_file_index_:", current_file_index, + " >= filenames_.size():", dataset()->filenames_.size()); + } + + // Actually move on to next file. + const string& next_filename = dataset()->filenames_[current_file_index]; + TF_RETURN_IF_ERROR(env->NewRandomAccessFile(next_filename, &file)); + reader = absl::make_unique( + file.get(), dataset()->reader_buffer_size_); + if (atds_decoder_ == nullptr) { + atds_decoder_ = std::make_unique( + dataset()->dense_features_, dataset()->sparse_features_, + dataset()->varlen_features_); + TF_RETURN_IF_ERROR(atds_decoder_->Initialize(reader->GetSchema())); + expected_schema_ = atds_decoder_->GetSchema().toJson(false); + } else if (expected_schema_ != reader->GetSchema().toJson(false)) { + string expected_schema = atds_decoder_->GetSchema().toJson(true); + string varied_schema = reader->GetSchema().toJson(true); + string filename = dataset()->filenames_[0]; + return atds::VariedSchemaNotSupportedError( + expected_schema, filename, varied_schema, next_filename); + } + return OkStatus(); + } + + // Resets all reader streams. 
+ void ResetStreamsLocked(std::unique_ptr& file, + std::unique_ptr& reader) { + reader.reset(); + file.reset(); + } + + void InitSparseValueBuffer(atds::sparse::ValueBuffer& buffer, + size_t num_of_datum) { + auto& sparse_dtype_counts = dataset()->sparse_dtype_counts_; + auto& sparse_expected_elements = dataset()->sparse_expected_elements_; + for (size_t i = 0; i < sparse_dtype_counts.int_counts; i++) { + buffer.int_values[i].reserve(num_of_datum * + sparse_expected_elements.int_values[i]); + } + for (size_t i = 0; i < sparse_dtype_counts.long_counts; i++) { + buffer.long_values[i].reserve(num_of_datum * + sparse_expected_elements.long_values[i]); + } + for (size_t i = 0; i < sparse_dtype_counts.float_counts; i++) { + buffer.float_values[i].reserve( + num_of_datum * sparse_expected_elements.float_values[i]); + } + for (size_t i = 0; i < sparse_dtype_counts.double_counts; i++) { + buffer.double_values[i].reserve( + num_of_datum * sparse_expected_elements.double_values[i]); + } + for (size_t i = 0; i < sparse_dtype_counts.string_counts; i++) { + buffer.string_values[i].reserve( + num_of_datum * sparse_expected_elements.string_values[i]); + } + for (size_t i = 0; i < sparse_dtype_counts.bool_counts; i++) { + buffer.bool_values[i].reserve(num_of_datum * + sparse_expected_elements.bool_values[i]); + } + + size_t num_of_sparse = dataset()->num_of_sparse_; + for (size_t i = 0; i < num_of_sparse; i++) { + buffer.num_of_elements[i].reserve(num_of_datum); + buffer.indices[i].reserve(num_of_datum * + sparse_expected_elements.indices[i]); + } + } + + void GetUniformBlockRanges(size_t num_threads, + std::vector& block_nums) { + size_t num_blocks = blocks_.size(); + size_t blocks_per_thread = num_blocks / num_threads; + size_t remainder = num_blocks % num_threads; + size_t block_idx = 0; + for (size_t i = 0; i < num_threads; i++) { + block_idx += blocks_per_thread; + if (i < remainder) { + block_idx += 1; + } + block_nums.emplace_back(block_idx); + } + } + + double 
GetTotalCost(double& decode_cost_per_record, + double& decompress_cost_per_record) { + decode_cost_per_record = + static_cast(GetTotalStats(total_decode_micros_)) / + GetTotalStats(total_records_parsed_); + decompress_cost_per_record = 0; + double total_cost = decode_cost_per_record * batch_size_; + if (GetTotalStats(num_decompressed_objects_) > 0) { + decompress_cost_per_record = + static_cast(GetTotalStats(total_decompress_micros_)) / + GetTotalStats(num_decompressed_objects_); + // Newly read blocks are appended to the end of blocks_ array, and all + // non-newly read blocks were already decompressed in previous + // GetNextInternal iterations. So we loop through blocks in reverse + // order, and terminate when we encounter an already decompressed block + // (null codec). + for (size_t i = blocks_.size(); + i > 0 && blocks_[i - 1]->codec != avro::NULL_CODEC; i--) { + total_cost += + (decompress_cost_per_record * blocks_[i - 1]->object_count); + } + } + return total_cost; + } + + void GetCostBasedBlockRanges(size_t num_threads, + std::vector& block_nums) { + size_t num_blocks = blocks_.size(); + double decode_cost_per_record; + double decompress_cost_per_record; + double total_cost = + GetTotalCost(decode_cost_per_record, decompress_cost_per_record); + double cost_per_thread = total_cost / num_threads; + size_t block_idx = 0; + size_t thread_idx = 0; + double running_cost = 0; + while (thread_idx < num_threads) { + while (running_cost < cost_per_thread * (thread_idx + 1) && + block_idx < num_blocks) { + if (blocks_[block_idx]->codec != avro::NULL_CODEC) { + running_cost += + decompress_cost_per_record * blocks_[block_idx]->object_count; + } + running_cost += + decode_cost_per_record * blocks_[block_idx]->num_to_decode; + block_idx++; + } + block_nums.emplace_back(block_idx); + thread_idx++; + } + block_nums[num_threads - 1] = num_blocks; + } + + void GetBlockRanges(size_t num_threads, std::vector& block_nums) { + block_nums.reserve(num_threads); + if 
(GetTotalStats(total_decode_micros_) == 0) { + // No decode time statistics yet. Divide blocks evenly between threads + GetUniformBlockRanges(num_threads, block_nums); + } else { + // Get block ranges per thread based on runtime data + GetCostBasedBlockRanges(num_threads, block_nums); + } + } + + size_t ComputeNumAutotuneThreads(size_t curr_threads) { + size_t ideal_num_threads = curr_threads; + if (thread_itrs[0] > 0) { + double decode_cost_per_record; + double decompress_cost_per_record; + double total_cost = + GetTotalCost(decode_cost_per_record, decompress_cost_per_record); + double min_cost = std::numeric_limits::max(); + for (size_t i = 1; i < curr_threads; i++) { + // Compute cost when using `i` threads + double cost_per_thread = total_cost / i; + double max_thread_delay = 0; + for (size_t j = 0; j < i; j++) { + double thread_delay = 0; + if (thread_itrs[j] > 0) { + thread_delay = thread_delays[j] / thread_itrs[j]; + } + max_thread_delay = std::max(thread_delay, max_thread_delay); + } + if (cost_per_thread + max_thread_delay < min_cost) { + min_cost = cost_per_thread + max_thread_delay; + ideal_num_threads = i; + } + } + } + return ideal_num_threads; + } + + uint64 GetTotalStats(std::vector& vec) { + return std::accumulate(vec.begin(), vec.end(), 0); + } + + std::unique_ptr shuffle_handler_ = nullptr; + std::unique_ptr decompression_handler_ = nullptr; + const std::shared_ptr cond_var_ = nullptr; + const std::shared_ptr write_var_ = nullptr; + size_t batch_size_; + size_t shuffle_buffer_size_; + + atds::sparse::ValueBuffer value_buffer_; + std::unique_ptr thread_pool_ = nullptr; + + const std::shared_ptr mu_; + std::unique_ptr prefetch_thread_ TF_GUARDED_BY(*mu_); + std::vector > blocks_ TF_GUARDED_BY(*mu_); + + mutex input_mu_ TF_ACQUIRED_BEFORE(*mu_); + size_t count_ TF_GUARDED_BY(input_mu_) = 0; + bool cancelled_ TF_GUARDED_BY(input_mu_) = false; + bool prefetch_thread_finished_ TF_GUARDED_BY(input_mu_) = false; + Status prefetch_thread_status_ 
TF_GUARDED_BY(input_mu_); + uint64 num_blocks_read_ TF_GUARDED_BY(input_mu_) = 0; + std::vector > write_blocks_ + TF_GUARDED_BY(input_mu_); + + std::unique_ptr atds_decoder_ = nullptr; + string expected_schema_ = ""; + std::vector total_records_parsed_ TF_GUARDED_BY(*mu_); + std::vector total_decode_micros_ TF_GUARDED_BY(*mu_); + std::vector num_decompressed_objects_ TF_GUARDED_BY(*mu_); + std::vector total_decompress_micros_ TF_GUARDED_BY(*mu_); + std::vector thread_delays TF_GUARDED_BY(*mu_); + std::vector thread_itrs TF_GUARDED_BY(*mu_); + }; + + const std::vector filenames_; + const int64 batch_size_, reader_buffer_size_, shuffle_buffer_size_, + num_parallel_calls_; + const bool drop_remainder_; + const std::vector feature_keys_, feature_types_; + const std::vector sparse_dtypes_; + const std::vector sparse_shapes_; + const std::vector output_dtypes_; + const std::vector output_shapes_; + std::vector sparse_value_index_; + DataTypeVector output_dtype_vector_; + + std::vector output_tensor_types_; + + std::vector dense_features_; + std::vector sparse_features_; + std::vector varlen_features_; + SparseDtypeCounts sparse_dtype_counts_; + SparseExpectedElements sparse_expected_elements_; + size_t num_of_dense_ = 0, num_of_sparse_ = 0; +}; + +ATDSDatasetOp::ATDSDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr(kFeatureKeys, &feature_keys_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr(kFeatureTypes, &feature_types_)); + + OP_REQUIRES_OK(ctx, ctx->GetAttr(kSparseDtypes, &sparse_dtypes_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr(kSparseShapes, &sparse_shapes_)); + + OP_REQUIRES_OK(ctx, ctx->GetAttr(kOutputDtypes, &output_dtypes_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr(kOutputShapes, &output_shapes_)); + + auto feature_num = feature_keys_.size(); + OP_REQUIRES(ctx, feature_num == feature_types_.size(), + errors::InvalidArgument(strings::StrCat( + "The length of feature_keys must equal to the ", + "length of feature_types. 
[", feature_num, + " != ", feature_types_.size(), "]"))); + + OP_REQUIRES(ctx, feature_num == output_dtypes_.size(), + errors::InvalidArgument(strings::StrCat( + "The length of feature_keys must equal to the ", + "length of output_dtypes. [", feature_num, + " != ", output_dtypes_.size(), "]"))); + + OP_REQUIRES(ctx, feature_num == output_shapes_.size(), + errors::InvalidArgument(strings::StrCat( + "The length of feature_keys must equal to the ", + "length of output_shapes. [", feature_num, + " != ", output_shapes_.size(), "]"))); + + size_t num_sparse = 0; + for (auto& type : feature_types_) { + OP_REQUIRES( + ctx, type == kDenseType || type == kSparseType || type == kVarlenType, + errors::InvalidArgument(strings::StrCat( + "Invalid feature_type, '", type, "'. Only ", kDenseType, ", ", + kSparseType, ", and ", kVarlenType, " are supported."))); + if (type == kSparseType || type == kVarlenType) { + num_sparse++; + } + } + + OP_REQUIRES(ctx, sparse_dtypes_.size() == num_sparse, + errors::InvalidArgument(strings::StrCat( + "The length of sparse_dtypes must equal to the number of ", + "sparse features configured in feature_types. [", + sparse_dtypes_.size(), " != ", num_sparse, "]"))); + + OP_REQUIRES(ctx, sparse_shapes_.size() == num_sparse, + errors::InvalidArgument(strings::StrCat( + "The length of sparse_shapes must equal to the number of ", + "sparse features configured in feature_types. 
[", + sparse_shapes_.size(), " != ", num_sparse, "]"))); +} + +void ATDSDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase** output) { + const Tensor* filenames_tensor; + OP_REQUIRES_OK(ctx, ctx->input(kFileNames, &filenames_tensor)); + OP_REQUIRES( + ctx, filenames_tensor->dims() <= 1, + errors::InvalidArgument("`filenames` must be a scalar or a vector.")); + + std::vector filenames; + filenames.reserve(filenames_tensor->NumElements()); + for (int i = 0; i < filenames_tensor->NumElements(); ++i) { + VLOG(2) << "Reading file: " << filenames_tensor->flat()(i); + filenames.push_back(filenames_tensor->flat()(i)); + } + + int64 batch_size = 0; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, kBatchSize, &batch_size)); + OP_REQUIRES( + ctx, batch_size > 0, + errors::InvalidArgument(strings::StrCat( + "`batch_size` must be greater than 0 but found ", batch_size))); + + bool drop_remainder = false; + OP_REQUIRES_OK( + ctx, ParseScalarArgument(ctx, kDropRemainder, &drop_remainder)); + + int64 reader_buffer_size = 0; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, kReaderBufferSize, + &reader_buffer_size)); + OP_REQUIRES(ctx, reader_buffer_size > 0, + errors::InvalidArgument(strings::StrCat( + "`reader_buffer_size` must be greater than 0 but found ", + reader_buffer_size))); + + int64 shuffle_buffer_size = 0; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, kShuffleBufferSize, + &shuffle_buffer_size)); + OP_REQUIRES( + ctx, shuffle_buffer_size >= 0, + errors::InvalidArgument(strings::StrCat( + "`shuffle_buffer_size` must be greater than or equal to 0 but found ", + shuffle_buffer_size))); + + int64 num_parallel_calls = 0; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, kNumParallelCalls, + &num_parallel_calls)); + OP_REQUIRES(ctx, + num_parallel_calls > 0 || + num_parallel_calls == tensorflow::data::model::kAutotune, + errors::InvalidArgument( + strings::StrCat("`num_parallel_calls` must be a positive " + "integer or tf.data.AUTOTUNE, got ", + num_parallel_calls))); + 
*output = new Dataset( + ctx, std::move(filenames), batch_size, drop_remainder, reader_buffer_size, + shuffle_buffer_size, num_parallel_calls, feature_keys_, feature_types_, + sparse_dtypes_, sparse_shapes_, output_dtypes_, output_shapes_); +} + +namespace { +REGISTER_KERNEL_BUILDER(Name("IO>ATDSDataset").Device(DEVICE_CPU), + ATDSDatasetOp); +} // namespace + +} // namespace data +} // namespace tensorflow diff --git a/tensorflow_io/core/kernels/avro/atds_dataset_kernels.h b/tensorflow_io/core/kernels/avro/atds_dataset_kernels.h new file mode 100644 index 000000000..22bec7d6c --- /dev/null +++ b/tensorflow_io/core/kernels/avro/atds_dataset_kernels.h @@ -0,0 +1,60 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DATASET_OP_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +class ATDSDatasetOp : public DatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "ATDSDatum"; + static constexpr const char* const kFileNames = "filenames"; + static constexpr const char* const kBatchSize = "batch_size"; + static constexpr const char* const kDropRemainder = "drop_remainder"; + static constexpr const char* const kReaderBufferSize = "reader_buffer_size"; + static constexpr const char* const kShuffleBufferSize = "shuffle_buffer_size"; + static constexpr const char* const kNumParallelCalls = "num_parallel_calls"; + static constexpr const char* const kFeatureKeys = "feature_keys"; + static constexpr const char* const kFeatureTypes = "feature_types"; + static constexpr const char* const kSparseDtypes = "sparse_dtypes"; + static constexpr const char* const kSparseShapes = "sparse_shapes"; + static constexpr const char* const kOutputDtypes = "output_dtypes"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + static constexpr const char* const kDenseType = "dense"; + static constexpr const char* const kSparseType = "sparse"; + static constexpr const char* const kVarlenType = "varlen"; + + explicit ATDSDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override; + + private: + class Dataset; + + std::vector feature_keys_, feature_types_; + std::vector sparse_dtypes_, output_dtypes_; + std::vector sparse_shapes_, output_shapes_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_ATDS_DATASET_OP_H_ diff --git a/tensorflow_io/core/kernels/avro/avro_record_dataset_kernels.h 
b/tensorflow_io/core/kernels/avro/avro_record_dataset_kernels.h index 3d1a8668a..0c94930c0 100644 --- a/tensorflow_io/core/kernels/avro/avro_record_dataset_kernels.h +++ b/tensorflow_io/core/kernels/avro/avro_record_dataset_kernels.h @@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_DATA_CORE_KERNELS_AVRO_RECORD_DATASET_OP_H_ -#define TENSORFLOW_DATA_CORE_KERNELS_AVRO_RECORD_DATASET_OP_H_ +#ifndef TENSORFLOW_IO_CORE_KERNELS_AVRO_RECORD_DATASET_OP_H_ +#define TENSORFLOW_IO_CORE_KERNELS_AVRO_RECORD_DATASET_OP_H_ #include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/dataset_metadata.pb.h" @@ -38,4 +38,4 @@ class AvroRecordDatasetOp : public DatasetOpKernel { } // namespace data } // namespace tensorflow -#endif // TENSORFLOW_DATA_CORE_KERNELS_AVRO_RECORD_DATASET_OP_H_ +#endif // TENSORFLOW_IO_CORE_KERNELS_AVRO_RECORD_DATASET_OP_H_ diff --git a/tensorflow_io/core/ops/avro_ops.cc b/tensorflow_io/core/ops/avro_ops.cc index 658bdeb3d..8fc26733e 100644 --- a/tensorflow_io/core/ops/avro_ops.cc +++ b/tensorflow_io/core/ops/avro_ops.cc @@ -322,4 +322,38 @@ REGISTER_OP("IO>AvroReadablePartitions") return OkStatus(); }); +REGISTER_OP("IO>ATDSDataset") + .Input("filenames: string") + .Input("batch_size: int64") + .Input("drop_remainder: bool") + .Input("reader_buffer_size: int64") + .Input("shuffle_buffer_size: int64") + .Input("num_parallel_calls: int64") + .Output("handle: variant") + .Attr("feature_keys: list(string) >= 0") + .Attr("feature_types: list(string) >= 0") + .Attr("sparse_dtypes: list({float,double,int64,int32,string,bool}) >= 0") + .Attr("sparse_shapes: list(shape) >= 0") + .Attr( + "output_dtypes: list({float,double,int64,int32,string,bool,variant}) " + ">= 0") + .Attr("output_shapes: list(shape) >= 
0") + .SetIsStateful() + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle unused; + // `filenames` must be a scalar or a vector + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused)); + // `batch_size` must be a scalar + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + // `drop_remainder` must be a scalar + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + // `reader_buffer_size` must be a scalar + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + // `shuffle_buffer_size` must be a scalar + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + // `num_parallel_calls` must be a scalar + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + return shape_inference::ScalarShape(c); + }); + } // namespace tensorflow diff --git a/tensorflow_io/python/experimental/atds/__init__.py b/tensorflow_io/python/experimental/atds/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tensorflow_io/python/experimental/atds/dataset.py b/tensorflow_io/python/experimental/atds/dataset.py new file mode 100644 index 000000000..f627ef9c4 --- /dev/null +++ b/tensorflow_io/python/experimental/atds/dataset.py @@ -0,0 +1,300 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""ATDSDataset""" + +from typing import Optional + +import tensorflow as tf +from tensorflow.python.data.util import convert +from tensorflow.python.data.util import structure +from tensorflow.python.data.util import nest +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import errors +from tensorflow.python.framework import tensor_util + +from tensorflow_io.python.ops import core_ops +from tensorflow_io.python.experimental.atds.features import ( + DenseFeature, + SparseFeature, + VarlenFeature, +) + +# Argument default values used in ATDS Dataset. +_DEFAULT_DROP_REMAINDER = False  # Do not drop last batch. +_DEFAULT_READER_BUFFER_SIZE_BYTES = 128 * 1024  # 128 KB +_DEFAULT_SHUFFLE_BUFFER_SIZE_EXAMPLES = 0  # shuffle is disabled. +_DEFAULT_NUM_PARALLEL_CALLS = 1  # process sequentially. + +# Feature type name used in ATDS Dataset Op. +_DENSE_FEATURE_TYPE = "dense" +_SPARSE_FEATURE_TYPE = "sparse" +_VARLEN_FEATURE_TYPE = "varlen" + +# Supported feature configs +_SUPPORTED_FEATURE_CONFIG = (DenseFeature, SparseFeature, VarlenFeature) + + +class ATDSDataset(dataset_ops.DatasetSource): + """A `Dataset` comprising records from one or more Avro files. + + This dataset loads Avro records from the files into a dict of tensors. + The output dict has feature name as key and tf.Tensor or tf.SparseTensor + as value. The output tensor values are batched with the user defined + batch size. + + Shuffle can be enabled before batch by configuring shuffle buffer size. + The shuffle buffer size dictates the elements *in addition* to the batch size + that would be read and sampled. + This dataset keeps collecting Avro blocks (a sequence of Avro records), + until the total number of unread records is greater than the shuffle + buffer size + batch_size, then randomly samples a block from the collected blocks.
+ An Avro Record from the sampled block will be parsed and batched into + the output tensors. + + For instance, assume your dataset contains 5 blocks with 100 records in + each block. When the batch size is set to 32 and shuffle buffer size is set + to 128, this dataset will collect two blocks as the two blocks contain more + than 128 + 32 = 160 unread records, and randomly samples a block from the two + blocks 32 times. + When a block is sampled, a record in the sampled block is read and batched + into the output tensor dict until all records in the sampled block are read. + If only one block fits into the batch + shuffle_buffer_size, records in that + block will be read sequentially without shuffle. Users can increase the + shuffle buffer size or apply dataset unbatch, shuffle, and batch for better shuffling. + + The memory footprint of this shuffle buffer is significantly different from tf.data.Dataset.shuffle. + In TensorFlow's shuffle, the shuffle buffer specifies a separate buffer of elements to pick + random elements from. In this implementation, the shuffle buffer + batch forms the total number of + elements that would be read for sampling.
+ + Here's an example comparing shuffle between AvroReader and Tensorflow: + + Data size is 1000 + Batch size is 64 + + Case 1: perfect shuffle + Shuffle buffer size 1000 + TF: shuffle elements(1, 1000) to create a batch of size 64 + AvroReader: shuffle elements(1, 1000) to create a batch of size 64 + + Case 2: not perfect but shuffle > batch + Shuffle buffer size is 256 + TF: + Shuffle elements(1, 256) to pick 1 element + Shuffle elements(1, 257) to pick 1 element + … + shuffle(1, 320) to create a batch of size 64 + + AvroReader: + Shuffle elements(1, 320) to create a batch of size 64 + + Case 3: shuffle buffer < batch + Shuffle buffer size is 32 + TF: + Shuffle elements(1, 32) to pick 1 element + Shuffle elements(1, 33) to pick 1 element + … + shuffle(1, 96) to create a batch of size 64 + + ATDS: Shuffle elements(1, 96) to create a batch of size 64 + + Case 4: no shuffle + Shuffle buffer size is 0 + Tensorflow and ATDS both will just directly read to create a batch of size 64 + + + A minimal example is given below: + + >>> import tempfile + >>> import avro.schema + >>> from avro.datafile import DataFileWriter + >>> from avro.io import DatumWriter + >>> from tensorflow_io.python.experimental.atds.dataset import ATDSDataset + >>> from tensorflow_io.python.experimental.atds.features import DenseFeature + >>> example_path = os.path.join(tempfile.gettempdir(), "example.avro") + >>> np.random.seed(0) + + >>> # Define Avro schema in ATDS format. + >>> json_schema = '''{ + ... "type": "record", + ... "name": "example", + ... "fields": [ + ... { "name": "x", "type": "float" }, + ... { "name": "y", "type": "float" } + ... ] + ... }''' + >>> schema = avro.schema.Parse(json_schema) + + >>> # Write the Avro records to a file. + >>> with open(example_path, "wb") as f: + ... writer = DataFileWriter(f, DatumWriter(), schema) + ... for _ in range(3): + ... x, y = np.random.random(), np.random.random() + ... writer.append({"x": x, "y": y}) + ... 
writer.close() + + >>> # Read the data back out. + >>> feature_config = { + ... "x": DenseFeature([], dtype=tf.float32), + ... "y": DenseFeature([], dtype=tf.float32) + ... } + >>> for batch in ATDSDataset([example_path], batch_size=2, + ... features=feature_config): + ... print("x = {x}, y = {y}".format(**batch)) + x = [0.5488135 0.60276335], y = [0.71518934 0.5448832 ] + x = [0.4236548], y = [0.6458941] + """ + + def __init__( + self, + filenames, + batch_size, + features, + drop_remainder=False, + reader_buffer_size=None, + shuffle_buffer_size=None, + num_parallel_calls=None, + ): + """Creates a `ATDSDataset` to read one or more Avro files encoded with + ATDS Schema. + + Each element of the dataset contains an Avro Record that will be + parsed into a dict of tensors. + + Args: + filenames: A `tf.string` tensor containing one or more filenames. + batch_size: A `tf.int64` scalar representing the number of records to + read and parse per iteration. + features: A feature configuration dict with feature name as key and + ATDS feature as value. ATDS features can be one of the DenseFeature, + SparseFeature, or VarlenFeature. See + tensorflow_io.python.experimental.atds.features for more details. + drop_remainder: (Optional.) A `tf.bool` scalar tf.Tensor, representing + whether the last batch should be dropped in the case it has fewer + than batch_size elements. The default behavior is not to drop the + smaller batch. + reader_buffer_size: (Optional.) A `tf.int64` scalar representing the + number of bytes used in the file content buffering. + shuffle_buffer_size: (Optional.) A `tf.int64` scalar representing the + number of records to shuffle together before batching. If not + specified, data is batched without shuffle. + num_parallel_calls: (Optional.) A `tf.int64` scalar representing the + maximum thread number used in the dataset. If greater than one, + records in files are processed in parallel with deterministic order. 
+ The number will be truncated when it is greater than the maximum + available parallelism number on the host. If set to `tf.data.AUTOTUNE`, + number of threads will be adjusted dynamically based on workload and + available resources. If not specified, records will be processed sequentially. + + Raises: + TypeError: If any argument does not have the expected type. + ValueError: If any argument does not have the expected shape + or features have invalid config. + """ + self._filenames = filenames + self._batch_size = batch_size + self._drop_remainder = convert.optional_param_to_tensor( + "drop_remainder", + drop_remainder, + argument_default=_DEFAULT_DROP_REMAINDER, + argument_dtype=tf.bool, + ) + self._reader_buffer_size = convert.optional_param_to_tensor( + "reader_buffer_size", + reader_buffer_size, + argument_default=_DEFAULT_READER_BUFFER_SIZE_BYTES, + ) + self._shuffle_buffer_size = convert.optional_param_to_tensor( + "shuffle_buffer_size", + shuffle_buffer_size, + argument_default=_DEFAULT_SHUFFLE_BUFFER_SIZE_EXAMPLES, + ) + self._num_parallel_calls = convert.optional_param_to_tensor( + "num_parallel_calls", + num_parallel_calls, + argument_default=_DEFAULT_NUM_PARALLEL_CALLS, + ) + + if features is None or not isinstance(features, dict): + raise ValueError( + f"Features can only be a dict with feature name as key" + f" and ATDS feature configuration as value but found {features}." + f" Available feature configuration are {_SUPPORTED_FEATURE_CONFIG}." + ) + if not features: + raise ValueError( + "Features dict cannot be empty and should have at " "least one feature." + ) + + feature_keys = [] + feature_types = [] + sparse_dtypes = [] + sparse_shapes = [] + + element_spec = {} + for key in sorted(features): + feature = features[key] + if not isinstance(feature, _SUPPORTED_FEATURE_CONFIG): + raise ValueError( + f"Unknown ATDS feature configuration {feature}. " + f"Only {_SUPPORTED_FEATURE_CONFIG} are supported." 
+ ) + + feature_keys.append(key) + shape = [dim if dim != -1 else None for dim in feature.shape] + if isinstance(feature, DenseFeature): + feature_types.append(_DENSE_FEATURE_TYPE) + element_spec[key] = tf.TensorSpec(shape, feature.dtype) + elif isinstance(feature, SparseFeature): + feature_types.append(_SPARSE_FEATURE_TYPE) + sparse_dtypes.append(feature.dtype) + sparse_shapes.append(shape) + element_spec[key] = tf.SparseTensorSpec(shape, feature.dtype) + elif isinstance(feature, VarlenFeature): + feature_types.append(_VARLEN_FEATURE_TYPE) + sparse_dtypes.append(feature.dtype) + sparse_shapes.append(shape) + element_spec[key] = tf.SparseTensorSpec(shape, feature.dtype) + + constant_drop_remainder = tensor_util.constant_value(self._drop_remainder) + if constant_drop_remainder: + constant_batch_size = tensor_util.constant_value(self._batch_size) + self._element_spec = nest.map_structure( + lambda spec: spec._batch(constant_batch_size), element_spec + ) + else: + self._element_spec = nest.map_structure( + lambda spec: spec._batch(None), element_spec + ) + + variant_tensor = core_ops.io_atds_dataset( + filenames=self._filenames, + batch_size=self._batch_size, + drop_remainder=self._drop_remainder, + reader_buffer_size=self._reader_buffer_size, + shuffle_buffer_size=self._shuffle_buffer_size, + num_parallel_calls=self._num_parallel_calls, + feature_keys=feature_keys, + feature_types=feature_types, + sparse_dtypes=sparse_dtypes, + sparse_shapes=sparse_shapes, + output_dtypes=structure.get_flat_tensor_types(self._element_spec), + output_shapes=structure.get_flat_tensor_shapes(self._element_spec), + ) + super().__init__(variant_tensor) + + @property + def element_spec(self): + return self._element_spec diff --git a/tensorflow_io/python/experimental/atds/features.py b/tensorflow_io/python/experimental/atds/features.py new file mode 100644 index 000000000..a599d6ac8 --- /dev/null +++ b/tensorflow_io/python/experimental/atds/features.py @@ -0,0 +1,115 @@ +# Copyright 2023 
The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import collections +from typing import List + +import tensorflow as tf + +ATDS_SUPPORTED_DTYPES = [tf.int32, tf.int64, tf.float32, tf.float64, tf.string, tf.bool] + + +def _raise_error_if_dtype_not_supported(dtype: tf.dtypes.DType): + if dtype not in ATDS_SUPPORTED_DTYPES: + raise ValueError( + f"{dtype} is not supported in ATDS. " + f"Available dtypes are {ATDS_SUPPORTED_DTYPES}." + ) + + +def _raise_error_if_shape_is_none(shape: List[int]): + if shape is None: + raise ValueError(f"Shape cannot be None.") + + +def _raise_error_if_shape_has_none_or_zero(shape: List[int]): + for dim in shape: + if dim is None or dim == 0: + raise ValueError( + f"Dimension in shape cannot be None or 0 but found {shape}." + ) + + +def _raise_error_if_dtype_is_none(dtype: tf.dtypes.DType): + if dtype is None: + raise ValueError(f"dtype cannot be None.") + + +def _validate_shape_and_dtype(shape: List[int], dtype: tf.dtypes.DType): + _raise_error_if_shape_is_none(shape) + _raise_error_if_shape_has_none_or_zero(shape) + _raise_error_if_dtype_is_none(dtype) + _raise_error_if_dtype_not_supported(dtype) + + +class DenseFeature(collections.namedtuple("DenseFeature", ["shape", "dtype"])): + """ + Configuration for reading and parsing a tf.Tensor encoded with + ATDS dense feature schema. 
+ + Fields: + shape: Shape of input data. Each dimension should be positive. + dtype: Data type of input. + """ + + def __new__(cls, shape: List[int], dtype: tf.dtypes.DType): + _validate_shape_and_dtype(shape, dtype) + for dim in shape: + if dim <= 0: + raise ValueError( + f"Each dimension should be greater than 0" + f" in DenseFeature but found {shape}." + ) + + return super().__new__(cls, shape, dtype) + + +class SparseFeature(collections.namedtuple("SparseFeature", ["shape", "dtype"])): + """ + Configuration for reading and parsing a tf.SparseTensor encoded with + ATDS sparse feature schema. + + Fields: + shape: Shape of input data. shape cannot be empty. + dtype: Data type of input. + """ + + def __new__(cls, shape: List[int], dtype: tf.dtypes.DType): + _validate_shape_and_dtype(shape, dtype) + if len(shape) == 0: + raise ValueError("SparseFeature cannot be scalar.") + + return super().__new__(cls, shape, dtype) + + +class VarlenFeature(collections.namedtuple("VarlenFeature", ["shape", "dtype"])): + """ + Configuration for reading and parsing a tf.SparseTensor encoded with + ATDS ragged feature schema. + + Fields: + shape: Shape of input data. Use -1 as unknown dimension. + dtype: Data type of input. + """ + + def __new__(cls, shape: List[int], dtype: tf.dtypes.DType): + _validate_shape_and_dtype(shape, dtype) + for dim in shape: + if dim <= 0 and dim != -1: + raise ValueError( + f"Each dimension should be greater than 0 or " + f"-1 in VarlenFeature but found {shape}." 
+ ) + + return super().__new__(cls, shape, dtype) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_atds_avro/__init__.py b/tests/test_atds_avro/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_atds_avro/benchmark/test_dense_tensor_atds_benchmark.py b/tests/test_atds_avro/benchmark/test_dense_tensor_atds_benchmark.py new file mode 100644 index 000000000..62864ba6f --- /dev/null +++ b/tests/test_atds_avro/benchmark/test_dense_tensor_atds_benchmark.py @@ -0,0 +1,313 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================== +"""ATDSDataset benchmark with dense tensors.""" + +import pytest +import tensorflow as tf + +from tests.test_atds_avro.utils.data_source_registry import ( + TensorType, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, +) +from tests.test_atds_avro.utils.atds_benchmark_utils import run_atds_benchmark + + +@pytest.mark.benchmark( + group="dense_int32_0d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_int32_0d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 0, + tf.int32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_int32_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_int32_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 1, + tf.int32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_int32_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_int32_2d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 2, + tf.int32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_int64_0d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_int64_0d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 0, + tf.int64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_int64_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_int64_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 1, + tf.int64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_int64_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_int64_2d(batch_size, benchmark): + run_atds_benchmark( + 
TensorType.DENSE, + 2, + tf.int64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_float32_0d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_float32_0d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 0, + tf.float32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_float32_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_float32_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 1, + tf.float32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_float32_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_float32_2d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 2, + tf.float32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_float64_0d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_float64_0d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 0, + tf.float64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_float64_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_float64_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 1, + tf.float64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_float64_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_float64_2d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 2, + tf.float64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_string_0d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def 
test_dense_string_0d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 0, + tf.string, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_string_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_string_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 1, + tf.string, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_string_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_string_2d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 2, + tf.string, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_bool_0d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_bool_0d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 0, + tf.bool, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_bool_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_bool_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 1, + tf.bool, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="dense_bool_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_dense_bool_2d(batch_size, benchmark): + run_atds_benchmark( + TensorType.DENSE, + 2, + tf.bool, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) diff --git a/tests/test_atds_avro/benchmark/test_sparse_tensor_atds_benchmark.py b/tests/test_atds_avro/benchmark/test_sparse_tensor_atds_benchmark.py new file mode 100644 index 000000000..1a3dcfe65 --- /dev/null +++ b/tests/test_atds_avro/benchmark/test_sparse_tensor_atds_benchmark.py @@ -0,0 +1,217 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# ============================================================================== +"""ATDSDataset benchmark with sparse tensors.""" + +import pytest +import tensorflow as tf + +from tests.test_atds_avro.utils.data_source_registry import ( + TensorType, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, +) +from tests.test_atds_avro.utils.atds_benchmark_utils import run_atds_benchmark + + +@pytest.mark.benchmark( + group="sparse_int32_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_sparse_int32_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.SPARSE, + 1, + tf.int32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="sparse_int32_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_sparse_int32_2d(batch_size, benchmark): + run_atds_benchmark( + TensorType.SPARSE, + 2, + tf.int32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="sparse_int64_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_sparse_int64_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.SPARSE, + 1, + tf.int64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="sparse_int64_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_sparse_int64_2d(batch_size, benchmark): + run_atds_benchmark( + 
TensorType.SPARSE, + 2, + tf.int64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="sparse_float32_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_sparse_float32_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.SPARSE, + 1, + tf.float32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="sparse_float32_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_sparse_float32_2d(batch_size, benchmark): + run_atds_benchmark( + TensorType.SPARSE, + 2, + tf.float32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="sparse_float64_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_sparse_float64_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.SPARSE, + 1, + tf.float64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="sparse_float64_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_sparse_float64_2d(batch_size, benchmark): + run_atds_benchmark( + TensorType.SPARSE, + 2, + tf.float64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="sparse_string_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_sparse_string_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.SPARSE, + 1, + tf.string, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="sparse_string_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_sparse_string_2d(batch_size, benchmark): + run_atds_benchmark( + TensorType.SPARSE, + 2, + tf.string, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="sparse_bool_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def 
test_sparse_bool_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.SPARSE, + 1, + tf.bool, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="sparse_bool_2d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_sparse_bool_2d(batch_size, benchmark): + run_atds_benchmark( + TensorType.SPARSE, + 2, + tf.bool, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) diff --git a/tests/test_atds_avro/benchmark/test_varlen_tensor_atds_benchmark.py b/tests/test_atds_avro/benchmark/test_varlen_tensor_atds_benchmark.py new file mode 100644 index 000000000..097b46a88 --- /dev/null +++ b/tests/test_atds_avro/benchmark/test_varlen_tensor_atds_benchmark.py @@ -0,0 +1,121 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================== +"""ATDSDataset benchmark with varlen tensors.""" + +import pytest +import tensorflow as tf + +from tests.test_atds_avro.utils.data_source_registry import ( + TensorType, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, +) +from tests.test_atds_avro.utils.atds_benchmark_utils import run_atds_benchmark + + +@pytest.mark.benchmark( + group="varlen_int32_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_varlen_int32_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.VARLEN, + 1, + tf.int32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="varlen_int64_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_varlen_int64_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.VARLEN, + 1, + tf.int64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="varlen_float32_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_varlen_float32_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.VARLEN, + 1, + tf.float32, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="varlen_float64_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_varlen_float64_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.VARLEN, + 1, + tf.float64, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="varlen_string_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_varlen_string_1d(batch_size, benchmark): + run_atds_benchmark( + TensorType.VARLEN, + 1, + tf.string, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) + + +@pytest.mark.benchmark( + group="varlen_bool_1d", +) +@pytest.mark.parametrize("batch_size", [(128)]) +def test_varlen_bool_1d(batch_size, benchmark): + 
run_atds_benchmark( + TensorType.VARLEN, + 1, + tf.bool, + SMALL_NUM_RECORDS, + SINGLE_PARTITION, + batch_size, + benchmark, + ) diff --git a/tests/test_atds_avro/utils/__init__.py b/tests/test_atds_avro/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_atds_avro/utils/atds_benchmark_utils.py b/tests/test_atds_avro/utils/atds_benchmark_utils.py new file mode 100644 index 000000000..11306b053 --- /dev/null +++ b/tests/test_atds_avro/utils/atds_benchmark_utils.py @@ -0,0 +1,196 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================== +"""Utility functions for ATDS benchmarks.""" + +import avro.schema +import glob +import json +import os +import tensorflow as tf + +from tests.test_atds_avro.utils.data_source import DataSource +from tests.test_atds_avro.utils.data_source_registry import ( + SMALL_NUM_RECORDS, + get_canonical_name, + get_data_source_from_registry, +) +from tests.test_atds_avro.utils.generator.tensor_generator import ( + IntTensorGenerator, + FloatTensorGenerator, + WordTensorGenerator, + BoolTensorGenerator, +) +from tests.test_atds_avro.utils.generator.sparse_tensor_generator import ( + IntSparseTensorGenerator, + FloatSparseTensorGenerator, + WordSparseTensorGenerator, + BoolSparseTensorGenerator, + get_common_value_dist, +) +from tests.test_atds_avro.utils.generator.varlen_tensor_generator import ( + IntVarLenTensorGenerator, + FloatVarLenTensorGenerator, + WordVarLenTensorGenerator, + BoolVarLenTensorGenerator, +) + + +from tensorflow_io.python.experimental.atds.dataset import ATDSDataset +from tests.test_atds_avro.utils.atds_writer import ATDSWriter +from tests.test_atds_avro.utils.benchmark_utils import benchmark_func + + +_AVRO_TO_DTYPE = { + "int": tf.int32, + "long": tf.int64, + "float": tf.float32, + "double": tf.float64, + "boolean": tf.bool, + "string": tf.string, +} + +_AVRO_TO_DENSE_TENSOR_GENERATOR = { + "int": IntTensorGenerator, + "long": IntTensorGenerator, + "float": FloatTensorGenerator, + "double": FloatTensorGenerator, + "boolean": BoolTensorGenerator, + "string": WordTensorGenerator, +} + +_AVRO_TO_SPARSE_TENSOR_GENERATOR = { + "int": IntSparseTensorGenerator, + "long": IntSparseTensorGenerator, + "float": FloatSparseTensorGenerator, + "double": FloatSparseTensorGenerator, + "boolean": BoolSparseTensorGenerator, + "string": WordSparseTensorGenerator, +} + +_AVRO_TO_VARLEN_TENSOR_GENERATOR = { + "int": IntVarLenTensorGenerator, + "long": IntVarLenTensorGenerator, + "float": 
FloatVarLenTensorGenerator, + "double": FloatVarLenTensorGenerator, + "boolean": BoolVarLenTensorGenerator, + "string": WordVarLenTensorGenerator, +} + + +def get_features_from_data_source(writer, data_source): + """Generates a dict of features from data source object + + Args: + writer: ATDSWriter object + data_source: DataSource object + """ + scenario = data_source.scenario + features = { + feature_name: writer._get_atds_feature(scenario[feature_name]) + for feature_name in scenario + } + return features + + +def get_dataset( + files, + features, + batch_size=1, + shuffle_buffer_size=0, + parallelism=os.cpu_count(), + interleave_parallelism=0, +): + """Generates a tf.data.Dataset from a datasource + + Args: + files: A list of files + features: Dict of features + batch_size: (Optional.) Batch size for ATDS dataset + shuffle_buffer_size: (Optional.) Size of the buffer used for shuffling. See + tensorflow_io/python/experimental/atds/dataset.py for details. + If unspecified, data is not shuffled. + parallelism: (Optional.) Number of threads to use while decoding. Defaults + to all available cores. 
+ """ + if interleave_parallelism == 0: + dataset = ATDSDataset( + filenames=files, + batch_size=batch_size, + features=features, + shuffle_buffer_size=shuffle_buffer_size, + num_parallel_calls=parallelism, + ) + else: + dataset = tf.data.Dataset.list_files(files) + dataset = dataset.interleave( + lambda filename: ATDSDataset( + filenames=filename, + batch_size=batch_size, + features=features, + shuffle_buffer_size=shuffle_buffer_size, + num_parallel_calls=parallelism, + ), + cycle_length=interleave_parallelism, + num_parallel_calls=interleave_parallelism, + ) + return dataset.prefetch(1) + + +def _is_fully_defined_shape(shape): + return -1 not in shape + + +def run_atds_benchmark( + tensor_type, rank, dtype, num_records, partitions, batch_size, benchmark +): + data_source_name = get_canonical_name( + tensor_type, rank, dtype, num_records, partitions + ) + data_source = get_data_source_from_registry(data_source_name) + run_atds_benchmark_from_data_source(data_source, batch_size, benchmark) + + +def run_atds_benchmark_from_data_source( + data_source, + batch_size, + benchmark, + parallelism=tf.data.AUTOTUNE, + interleave_parallelism=0, + codec="null", + shuffle_buffer_size=0, + rounds=30, +): + with ATDSWriter(codec=codec) as writer: + dir_path = writer.write(data_source) + pattern = os.path.join(dir_path, f"*.{writer.extension}") + + dataset = get_dataset( + glob.glob(pattern), + get_features_from_data_source(writer, data_source), + batch_size=batch_size, + shuffle_buffer_size=shuffle_buffer_size, + parallelism=parallelism, + interleave_parallelism=interleave_parallelism, + ) + count = benchmark.pedantic( + target=benchmark_func, + args=[dataset], + iterations=2, + # pytest-benchmark calculates statistic across rounds. Set it with + # larger number (N > 30) for test statistic. 
+ rounds=rounds, + kwargs={}, + ) + assert count > 0, f"ATDS record count: {count} must be greater than 0" diff --git a/tests/test_atds_avro/utils/atds_writer.py b/tests/test_atds_avro/utils/atds_writer.py new file mode 100644 index 000000000..564ec1e44 --- /dev/null +++ b/tests/test_atds_avro/utils/atds_writer.py @@ -0,0 +1,304 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# ============================================================================== +import os +import hashlib +import json +import numpy as np +import tensorflow as tf + +from avro.schema import Parse as parse +from avro.datafile import DataFileWriter +from avro.io import DatumWriter + +from tests.test_atds_avro.utils.file_writer import FileWriter +from tests.test_atds_avro.utils.generator.varlen_tensor_generator import ( + VarLenTensorGeneratorBase, +) +from tensorflow_io.python.experimental.atds.features import ( + DenseFeature, + SparseFeature, + VarlenFeature, +) + + +class ATDSWriter(FileWriter): + # ATDSWriter generates Avro data from input DataSource. 
+ _BOOL_LIST_DTYPE = [tf.bool] + _INT64_LIST_DTYPE = [tf.int32, tf.int64, tf.uint32, tf.uint64] + _FLOAT_LIST_DTYPE = [tf.float32, tf.float64] + _BYTES_LIST_DTYPE = [tf.string] + _AVRORECORD_EXTENSION = "avro" + _SPARSE_INDICES_KEY = "indices" + _SPARSE_VALUES_KEY = "values" + + _DTYPE_TO_AVRO = { + tf.float32: "float", + tf.float64: "double", + tf.int32: "int", + tf.int64: "long", + tf.bool: "boolean", + tf.string: "bytes", + } + + _AVRO_TO_SPARSE_TENSOR = { + "int": "IntSparseTensor", + "long": "LongSparseTensor", + "double": "DoubleSparseTensor", + "float": "FloatSparseTensor", + "string": "StringSparseTensor", + "bytes": "BytesSparseTensor", + "boolean": "BoolSparseTensor", + } + + def __init__(self, codec="null"): + """Create a new FileWriter. + + This must be called by the constructors of subclasses. + """ + super().__init__() + self._codec = codec + + @property + def extension(self): + """Return the file extension of the written files.""" + return ATDSWriter._AVRORECORD_EXTENSION + + def hash_code(self): + """Return the hashed code of this file writer""" + hash_code = super().hash_code() + + m = hashlib.sha256() + m.update(hash_code.encode()) + m.update(self._codec.encode()) + return m.hexdigest() + + def _write_to_path_from_cached_data(self, dir_path, data_source, dataset): + if os.path.exists(dir_path): + return dir_path + os.makedirs(dir_path) + + scenario = data_source.scenario + schema = parse(self.scenario_to_avro_schema(scenario)) + + filenames_to_num_records = self._get_filenames_to_num_records(data_source) + iterator = iter(dataset) + for filename in sorted(filenames_to_num_records): + file_path = os.path.join(dir_path, filename) + num_records = filenames_to_num_records[filename] + + with open(file_path, "wb") as out: + file_writer = DataFileWriter( + out, DatumWriter(), schema, codec=self._codec + ) + for _ in range(num_records): + features = {} + record = iterator.get_next() + for feature_name in record: + generator = 
data_source.scenario[feature_name] + # tf.Example only supports tf.float32, tf.int64, and tf.string + # For other dtypes, cast feature into its original dtype. + actual = tf.cast(record[feature_name], generator.spec.dtype) + self._add_tensor_to_features( + generator, feature_name, actual, features + ) + file_writer.append(features) + file_writer.close() + + def _write_to_path(self, dir_path, data_source): + """Generate benchmark data and write the data under the given path. + + Args: + dir_path: A str path to write files to. + data_source: A DataSource objects. + + Raises: + NotImplementedError: If subclass does not overload the function. + """ + scenario = data_source.scenario + schema = parse(self.scenario_to_avro_schema(scenario)) + + filenames_to_num_records = self._get_filenames_to_num_records(data_source) + for filename in sorted(filenames_to_num_records): + file_path = os.path.join(dir_path, filename) + num_records = filenames_to_num_records[filename] + + with open(file_path, "wb") as out: + file_writer = DataFileWriter( + out, DatumWriter(), schema, codec=self._codec + ) + for _ in range(num_records): + features = {} + for feature_name in scenario: + generator = scenario[feature_name] + tensor = generator.generate() + self._add_tensor_to_features( + generator, feature_name, tensor, features + ) + file_writer.append(features) + file_writer.close() + + def _add_tensor_to_features(self, generator, feature_name, tensor, features): + spec = generator.spec + if isinstance(spec, tf.TensorSpec): + self._add_dense_tensor_to_features(feature_name, tensor, features) + elif isinstance(spec, tf.SparseTensorSpec): + if issubclass(generator.get_generator_cls(), VarLenTensorGeneratorBase): + self._add_dense_tensor_to_features( + feature_name, tf.sparse.to_dense(tensor), features + ) + else: + self._add_sparse_tensor_to_features(feature_name, tensor, features) + else: + raise TypeError(f"Spec {spec} is not supported in ATDSWriter") + + def scenario_to_avro_schema(self, 
scenario): + """ + Goes through a scenario to convert it to an avro schema + + """ + schema = {"type": "record", "name": "row", "fields": []} + for feature_name in scenario: + generator = scenario[feature_name] + spec = generator.spec + if isinstance(spec, tf.TensorSpec): + self._add_dense_feature_schema(feature_name, spec, schema) + elif isinstance(spec, tf.SparseTensorSpec): + if issubclass(generator.get_generator_cls(), VarLenTensorGeneratorBase): + self._add_dense_feature_schema(feature_name, spec, schema) + else: + self._add_sparse_feature_schema(feature_name, spec, "long", schema) + schema_str = json.dumps(schema) + return schema_str + + def _map_tensor_dtype_to_avro_dtype(self, dtype): + """This function maps tensor dtype into the python type supported by avro.""" + if dtype in ATDSWriter._DTYPE_TO_AVRO: + return ATDSWriter._DTYPE_TO_AVRO[dtype] + else: + raise TypeError(f"Type {dtype} is not supported in Avro.") + + def _add_dense_feature_schema(self, name, spec, schema): + schema["fields"].append( + { + "name": name, + "type": self._add_dense_feature_type( + spec.dtype, len(spec.shape.as_list()) + ), + } + ) + + def _add_dense_feature_type(self, dtype, rank): + # if scalar then convert tf type to python type name + avro_type = self._map_tensor_dtype_to_avro_dtype(dtype) + if rank == 0: + return avro_type + else: + return { + "type": "array", + "items": self._add_dense_feature_type(dtype, rank - 1), + } + + def _infer_sparse_tensor_type(self, dtype): + value_avro_type = self._map_tensor_dtype_to_avro_dtype(dtype) + return ATDSWriter._AVRO_TO_SPARSE_TENSOR[value_avro_type] + + def _add_sparse_feature_schema(self, name, spec, indices_avro_type, schema): + schema["fields"].append( + { + "name": name, + "type": { + "type": "record", + "name": name + "_" + self._infer_sparse_tensor_type(spec.dtype), + "fields": self._add_sparse_feature_fields( + spec.dtype, indices_avro_type, len(spec.shape.as_list()) + ), + }, + } + ) + + def _add_sparse_feature_fields(self, 
dtype, indices_avro_type, rank): + value_avro_type = self._map_tensor_dtype_to_avro_dtype(dtype) + fields = [] + for dim in range(rank): + indices_name_at_dim = ATDSWriter._SPARSE_INDICES_KEY + str(dim) + fields.append( + { + "name": indices_name_at_dim, + "type": {"type": "array", "items": indices_avro_type}, + } + ) + value_field = { + "name": ATDSWriter._SPARSE_VALUES_KEY, + "type": {"type": "array", "items": value_avro_type}, + } + fields.append(value_field) + return fields + + def _get_flat_value(self, tensor): + rank = len(tensor.shape.as_list()) + value = tensor.numpy() + if ( + tensor.dtype == tf.string and rank == 0 + ): # There is no tolist() method for bytes or string + return value + return value.tolist() + + def _add_dense_tensor_to_features(self, name, tensor, features): + features[name] = self._get_flat_value(tensor) + + def _add_sparse_tensor_to_features(self, name, tensor, features): + rank = len(tensor.shape.as_list()) + indices = tensor.indices.numpy() # indices tensor must be a 2D array + # Split indices array along the second dimension so that the split arrays + # contain the indices for separate dimension. For example, + # indices = [[0, 1], [2, 3], [4, 5]] => + # indices_at_dim0 = [[0], [2], [4]] and + # indices_at_dim1 = [[1], [3], [5]]. + split_indices = np.split(indices, rank, axis=1) + features[name] = {} + for dim in range(rank): + indices_name_at_dim = ATDSWriter._SPARSE_INDICES_KEY + str(dim) + # convert indices to 1d array + features[name][indices_name_at_dim] = split_indices[dim].flatten().tolist() + + features[name][ATDSWriter._SPARSE_VALUES_KEY] = self._get_flat_value( + tensor.values + ) + + def _get_atds_feature(self, generator): + """Build ATDS feature parsing config + + Args: + generator: A TensorGenerator, SparseTensorGenerator, or VarLenTensorGenerator for generating data. + + Returns: + DenseFeature, SparseFeature, or VarlenFeature + + Raises: + TypeError: if spec is not tf.TensorSpec or tf.SparseTensorSpec. 
+ """ + spec = generator.spec + if isinstance(spec, tf.TensorSpec): + return DenseFeature(shape=spec.shape.as_list(), dtype=spec.dtype) + elif isinstance(spec, tf.SparseTensorSpec): + if issubclass(generator.get_generator_cls(), VarLenTensorGeneratorBase): + atds_shape = [ + -1 if dim == None else dim for dim in spec.shape.as_list() + ] + return VarlenFeature(shape=atds_shape, dtype=spec.dtype) + else: + return SparseFeature(shape=spec.shape.as_list(), dtype=spec.dtype) + else: + raise TypeError(f"Spec {spec} is not supported in ATDSWriter.") diff --git a/tests/test_atds_avro/utils/benchmark_utils.py b/tests/test_atds_avro/utils/benchmark_utils.py new file mode 100644 index 000000000..6893f717b --- /dev/null +++ b/tests/test_atds_avro/utils/benchmark_utils.py @@ -0,0 +1,125 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================== +"""Utility functions for benchmarks.""" + +import os +import tensorflow as tf + +from tests.test_atds_avro.utils.data_source_registry import ( + get_canonical_name, + get_data_source_from_registry, +) +from tests.test_atds_avro.utils.generator.tensor_generator import ( + IntTensorGenerator, + FloatTensorGenerator, + BoolTensorGenerator, +) +from tests.test_atds_avro.utils.generator.sparse_tensor_generator import ( + IntSparseTensorGenerator, + ValueDistribution, +) +from tests.test_atds_avro.utils.generator.varlen_tensor_generator import ( + WordVarLenTensorGenerator, + DimensionDistribution, +) +from tests.test_atds_avro.utils.tf_record_writer import TFRecordWriter + +MIXED_TYPES_SCENARIO = { + # simulate scalar int as label. + "int32_0d_dense": IntTensorGenerator(tf.TensorSpec(shape=[], dtype=tf.int32)), + # simulate large sparse categorical ids. + "int64_1d_sparse": IntSparseTensorGenerator( + tf.SparseTensorSpec(shape=[50000], dtype=tf.int32), + ValueDistribution.SINGLE_VALUE, + ), + # simulate 1d float embedding input. + "float32_1d_varlen": FloatTensorGenerator( + tf.TensorSpec(shape=[128], dtype=tf.float32) + ), + # simulate 2d images + "float64_2d_dense": FloatTensorGenerator( + tf.TensorSpec(shape=[32, 32], dtype=tf.float64) + ), + # simulate a sentence with varlen words. + "string_1d_sparse": WordVarLenTensorGenerator( + tf.SparseTensorSpec(shape=[None], dtype=tf.string), + DimensionDistribution.LARGE_DIM, + ), + # simulate concatenated bool wide features. 
+ "bool_1d_dense": BoolTensorGenerator(tf.TensorSpec(shape=[5], dtype=tf.bool)), +} + + +def benchmark_func(dataset): + count = 0 + for _ in dataset: + count += 1 + return count + + +def create_tf_record_dataset( + filenames, parse_function, batch_size, file_parallelism=None, shuffle_buffer_size=0 +): + dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=file_parallelism) + if shuffle_buffer_size > 0: + dataset = dataset.shuffle(shuffle_buffer_size) + dataset = dataset.batch(batch_size) + dataset = dataset.map(parse_function) + dataset = dataset.prefetch(1) + return dataset + + +def run_tf_record_benchmark( + tensor_type, rank, dtype, num_records, partitions, batch_size, benchmark +): + data_source_name = get_canonical_name( + tensor_type, rank, dtype, num_records, partitions + ) + data_source = get_data_source_from_registry(data_source_name) + run_tf_record_benchmark_from_data_source(data_source, batch_size, benchmark) + + +def run_tf_record_benchmark_from_data_source( + data_source, + batch_size, + benchmark, + file_parallelism=None, + shuffle_buffer_size=0, + rounds=100, +): + with TFRecordWriter() as writer: + dir_path = writer.write(data_source) + pattern = os.path.join(dir_path, f"*.{writer.extension}") + filenames = tf.data.Dataset.list_files(pattern) + parse_function = writer.create_tf_example_parser_fn( + data_source, with_batch=True + ) + dataset = create_tf_record_dataset( + filenames, + parse_function, + batch_size, + file_parallelism=file_parallelism, + shuffle_buffer_size=shuffle_buffer_size, + ) + count = benchmark.pedantic( + target=benchmark_func, + args=[dataset], + iterations=2, + # pytest-benchmark calculates statistic across rounds. Set it with + # larger number (N > 30) for test statistic. 
+ rounds=rounds, + kwargs={}, + ) + assert count > 0, f"TF record count: {count} must be greater than 0" diff --git a/tests/test_atds_avro/utils/data_source.py b/tests/test_atds_avro/utils/data_source.py new file mode 100644 index 000000000..ac4b23904 --- /dev/null +++ b/tests/test_atds_avro/utils/data_source.py @@ -0,0 +1,106 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""DataSource""" + +import hashlib + +from tests.test_atds_avro.utils.hash_util import int_to_bytes + + +class DataSource: + """DataSource describes properties in a benchmark data. + + DataSource contains the metadata of a benchmark data including the total + number of records, the number of partitioned files and a scenario. A scenario + defines the features used in benchmark with the feature name, tensor spec, + and Generator used to generate the value. DataSource can be consumed by + FileWriter to generate the benchmark data described by itself. + """ + + def __init__(self, scenario, num_records, partitions=1): + """Create a new DataSource. + + Args: + scenario: A dict with feature name as key and Generator as value. + Scenario defines the features used in benchmark. Generator contains + tensor spec and the distribution to generate the tensor value. + num_records: An int defines total number of records in this data. 
+ partitions: An int defines the number of partitioned files in this data. + Each partition can have different number of records. However, the total + number of records must be num_records. + + Raises: + ValueError: If num_records or partitions is negative or partitions is + zero but num_records is greater than zero. + """ + if num_records < 0: + raise ValueError( + "Number of records in DataSource must not be negative" + f" but got {num_records}." + ) + if partitions < 0: + raise ValueError( + "Partition number in DataSource must not be negative" + f" but got {partitions}." + ) + if partitions == 0 and num_records > 0: + raise ValueError( + "Cannot have zero partitions in DataSource with" + f"non-zero num_records ({num_records})." + ) + + self._scenario = scenario + self._num_records = num_records + self._partitions = partitions + + @property + def scenario(self): + """Return the scenario of the benchmark data. + + The scenario is a dict with feature name as key and Generator as value. + """ + return self._scenario + + @property + def num_records(self): + """Return the total number of records in this data as int.""" + return self._num_records + + @property + def partitions(self): + """Return the number of partitioned files in this data as int.""" + return self._partitions + + def hash_code(self): + """Return the consistent hashed code of the DataSource in hex str. + + The hashed code can be used as the path for data source cache. + + Returns: + A hex str generated by hashing algorithm. + """ + m = hashlib.sha256() + # Step 1: hash sorted scenario dict + for name in sorted(self.scenario): + generator = self.scenario[name] + m.update(name.encode()) + m.update(generator.hash_code().encode()) + + # Step 2: hash num_records and partitions + m.update(int_to_bytes(self.num_records)) + m.update(int_to_bytes(self.partitions)) + + # Step 3: return hashed str in hex. 
+ return m.hexdigest() diff --git a/tests/test_atds_avro/utils/data_source_registry.py b/tests/test_atds_avro/utils/data_source_registry.py new file mode 100644 index 000000000..a9296c9d5 --- /dev/null +++ b/tests/test_atds_avro/utils/data_source_registry.py @@ -0,0 +1,220 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Global DataSource registry with predefined DataSource used in benchmark""" + +from enum import Enum + +import numpy as np +import tensorflow as tf +from tensorflow.python.framework import registry + +from tests.test_atds_avro.utils.data_source import DataSource +from tests.test_atds_avro.utils.generator.tensor_generator import ( + IntTensorGenerator, + FloatTensorGenerator, + WordTensorGenerator, + BoolTensorGenerator, +) +from tests.test_atds_avro.utils.generator.sparse_tensor_generator import ( + IntSparseTensorGenerator, + FloatSparseTensorGenerator, + WordSparseTensorGenerator, + BoolSparseTensorGenerator, + ValueDistribution, +) +from tests.test_atds_avro.utils.generator.varlen_tensor_generator import ( + DimensionDistribution, + IntVarLenTensorGenerator, + FloatVarLenTensorGenerator, + WordVarLenTensorGenerator, + BoolVarLenTensorGenerator, + DIM_DISTRIBUTION_TO_RANGE, +) + +SMALL_NUM_RECORDS = 1024 +LARGE_NUM_RECORDS = 12 * 1024 + +SINGLE_PARTITION = 1 +MULTIPLE_PARTITION = 6 + +FILE_PARALLELISM = 
2 + +_data_source_registry = registry.Registry("data source") + +# Data source name with all types of tensors. +ALL_TYPES_DATA_SOURCE_NAME = "all_types_data_source_name" + + +class TensorType(Enum): + """Type of tensors used in benchmark""" + + DENSE = 1 + SPARSE = 2 + VARLEN = 3 + + +def get_canonical_name(tensor_type, rank, dtype, num_records, partitions): + """Get canonical name which is used as key in global data source registry. + + Args: + tensor_type: A TensorType enum. + rank: An int to represent the rank of tensor. + dtype: tf.dtypes.DType. + num_records: Number of records. + partitions: Number of file partitions. + + Returns: + The canonical name to represent such data source in registry. + + Raises: + TypeError: If tensor_type is not TensorType. + ValueError: if rank is unknown or negative. + """ + if not isinstance(tensor_type, TensorType): + raise TypeError( + "Input tensor_type must be a TensorType enum" f" but found {tensor_type}" + ) + + if rank is None or rank < 0: + raise ValueError("Input rank must not be None or negative. Found {rank}.") + + return f"{tensor_type.name}_{rank}D_{dtype.name}_{num_records}_{partitions}" + + +def get_data_source_registry(): + """Get the global data source registry. If the registry is empty, + initialize the registry with predefined data sources.""" + global _data_source_registry + if not _data_source_registry.list(): + _init_data_source_registry(_data_source_registry) + return _data_source_registry + + +def _init_data_source_registry(registry): + shapes = [[], [128], [64, 64]] + dtypes = [tf.int32, tf.int64, tf.float32, tf.float64, tf.string, tf.bool] + + # Register data source with dense tensors. 
+ dense_generators = [ + IntTensorGenerator, + IntTensorGenerator, + FloatTensorGenerator, + FloatTensorGenerator, + WordTensorGenerator, + BoolTensorGenerator, + ] + for cls, dtype in zip(dense_generators, dtypes): + for shape in shapes: + name = get_canonical_name( + TensorType.DENSE, + rank=len(shape), + dtype=dtype, + num_records=SMALL_NUM_RECORDS, + partitions=SINGLE_PARTITION, + ) + generator = cls(tf.TensorSpec(shape=shape, dtype=dtype)) + registry.register( + candidate=DataSource( + scenario={name: generator}, + num_records=SMALL_NUM_RECORDS, + partitions=SINGLE_PARTITION, + ), + name=name, + ) + + # Register data source with sparse tensors. + sparse_generators = [ + IntSparseTensorGenerator, + IntSparseTensorGenerator, + FloatSparseTensorGenerator, + FloatSparseTensorGenerator, + WordSparseTensorGenerator, + BoolSparseTensorGenerator, + ] + for cls, dtype in zip(sparse_generators, dtypes): + for shape in shapes: + if len(shape) == 0: + # Skip scalars for sparse tensors + continue + name = get_canonical_name( + TensorType.SPARSE, + rank=len(shape), + dtype=dtype, + num_records=SMALL_NUM_RECORDS, + partitions=SINGLE_PARTITION, + ) + value_dist = ValueDistribution.SMALL_NUM_VALUE # 5 to 10 elements + generator = cls(tf.SparseTensorSpec(shape=shape, dtype=dtype), value_dist) + registry.register( + candidate=DataSource( + scenario={name: generator}, + num_records=SMALL_NUM_RECORDS, + partitions=SINGLE_PARTITION, + ), + name=name, + ) + + # Register data source with varlen tensors. 
+ varlen_generators = [ + IntVarLenTensorGenerator, + IntVarLenTensorGenerator, + FloatVarLenTensorGenerator, + FloatVarLenTensorGenerator, + WordVarLenTensorGenerator, + BoolVarLenTensorGenerator, + ] + varlen_shapes = [[None]] + for cls, dtype in zip(varlen_generators, dtypes): + for shape in varlen_shapes: + rank = len(shape) + name = get_canonical_name( + TensorType.VARLEN, + rank=rank, + dtype=dtype, + num_records=SMALL_NUM_RECORDS, + partitions=SINGLE_PARTITION, + ) + dim_dist = DimensionDistribution.LARGE_DIM # dim is between 5 to 10 + + generator = cls(tf.SparseTensorSpec(shape=shape, dtype=dtype), dim_dist) + registry.register( + candidate=DataSource( + scenario={name: generator}, + num_records=SMALL_NUM_RECORDS, + partitions=SINGLE_PARTITION, + ), + name=name, + ) + + # Registry data source with all types of tensors + scenario = {} + for key in registry.list(): + data_source = registry.lookup(key) + scenario = {**scenario, **data_source.scenario} + + # small num records for memory leak check + registry.register( + candidate=DataSource( + scenario=scenario, + num_records=SMALL_NUM_RECORDS, + partitions=MULTIPLE_PARTITION, + ), + name=ALL_TYPES_DATA_SOURCE_NAME, + ) + + +def get_data_source_from_registry(name): + registry = get_data_source_registry() + return registry.lookup(name) diff --git a/tests/test_atds_avro/utils/file_writer.py b/tests/test_atds_avro/utils/file_writer.py new file mode 100644 index 000000000..72cabdf56 --- /dev/null +++ b/tests/test_atds_avro/utils/file_writer.py @@ -0,0 +1,183 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""FileWriter""" + +import abc +import os +import shutil +import hashlib +import tempfile +import tensorflow as tf + +TF_IO_BENCHMARK_DATA_CACHE = "TF_IO_BENCHMARK_DATA_CACHE" + + +class FileWriter(metaclass=abc.ABCMeta): + """Base class for file writer. + + FileWriter consumes a DataSource and generates benchmark data as described + in the DataSource. DataSource contains benchmark data metadata such as value + generators, total number of records, and number of partitioned files. + + SubClass must implement `_write_to_path` function to write data under the + given path and follow the spec described in DataSource. + """ + + def __init__(self): + """Create a new FileWriter. + + This must be called by the constructors of subclasses. + """ + self._data_cache_path = os.getenv(TF_IO_BENCHMARK_DATA_CACHE, None) + self._dir_path = self._data_cache_path + + def __enter__(self): + """Enter a context to create dir_path for file generation.""" + if not self._data_cache_path: + self._dir_path = tempfile.mkdtemp() + return self + + def __exit__(self, exc_type, exc_value, traceback): + """Clean up dir_path after exiting the context.""" + if not self._data_cache_path: + shutil.rmtree(self._dir_path, ignore_errors=True) + self._dir_path = None + + def write(self, data_source): + """Generate benchmark data and return the path to the generated files. + + Args: + data_source: A DataSource object that describes the properties of + the benchmark data. 
+ + Returns: + A str path to the generated files. + """ + # Use data source hash code as data source folder name. + data_source_path = data_source.hash_code() + # Use writer hash code as the writer folder path. + writer_dir = self.hash_code() + + if self._data_cache_path is not None: + cached_dataset = None + writer_path = os.path.join( + self._data_cache_path, data_source_path, writer_dir + ) + if os.path.exists(writer_path): + return writer_path + from tests.test_atds_avro.utils.tf_record_writer import ( + TFRecordWriter, + ) + + with TFRecordWriter() as tf_writer: + tf_record_cache_dir = os.path.join( + self._data_cache_path, data_source_path, tf_writer.hash_code() + ) + if not os.path.exists(tf_record_cache_dir): + os.makedirs(tf_record_cache_dir, exist_ok=True) + tf_writer._write_to_path(tf_record_cache_dir, data_source) + parser_fn = tf_writer.create_tf_example_parser_fn(data_source) + pattern = os.path.join(tf_record_cache_dir, f"*.{tf_writer.extension}") + cached_dataset = tf.data.Dataset.list_files(pattern, shuffle=False) + cached_dataset = tf.data.TFRecordDataset(cached_dataset) + cached_dataset = cached_dataset.map(parser_fn) + self._write_to_path_from_cached_data( + writer_path, data_source, cached_dataset + ) + return writer_path + else: + writer_path = os.path.join(self._dir_path, data_source_path, writer_dir) + if not os.path.exists(writer_path): + os.makedirs(writer_path, exist_ok=True) + self._write_to_path(writer_path, data_source) + return writer_path + + @abc.abstractmethod + def _write_to_path(self, dir_path, data_source): + """Generate benchmark data and write the data under the given path. + + Args: + dir_path: A str path to write files to. + data_source: A DataSource object. + + Raises: + NotImplementedError: If subclass does not overload the function. + """ + raise NotImplementedError + + @abc.abstractmethod + def _write_to_path_from_cached_data(self, dir_path, data_source, dataset): + """Write the given dataset to the given path. 
+ + Args: + dir_path: A str path to write dataset to. + data_source: A DataSource object. + dataset: Cached dataset containing data to write. + + Raises: + NotImplementedError: If subclass does not overload the function. + """ + raise NotImplementedError + + @property + @abc.abstractmethod + def extension(self): + """Returns the extension of the filename e.g. tfrecords, avro, etc. + + Raises: + NotImplementedError: If subclass does not overload the function. + """ + raise NotImplementedError + + def hash_code(self): + """Return the hashed code of this file writer. + + The hashed code is used to create the folder that the writer can write + data to. It is useful for benchmark data cache. + + Returns: + The hashed code of the writer in hex str. + """ + m = hashlib.sha256() + # Hash the instance class name by default. Subclass can overload the + # function to support customized hashing logic for its own state. + m.update(self.__class__.__name__.encode()) + return m.hexdigest() + + def _get_filenames_to_num_records(self, data_source): + """Returns a dict mapping filenames to the number of records in that file. + + Args: + data_source: A DataSource describing the data to be written. + + Returns: + A dict mapping filename to number of records in that file. + """ + filenames_to_num_records = {} + partitions = data_source.partitions + record_per_partition = data_source.num_records // partitions + remaining = data_source.num_records % partitions + + partition_length = len(str(partitions)) + for file_index in range(partitions): + # Add leading zero to index_name e.g. 
0001 + index_name = str(file_index).zfill(partition_length) + filename = f"part-{index_name}.{self.extension}" + + num_records = record_per_partition + if remaining and file_index < remaining: + num_records = num_records + 1 + filenames_to_num_records[filename] = num_records + return filenames_to_num_records diff --git a/tests/test_atds_avro/utils/generator/__init__.py b/tests/test_atds_avro/utils/generator/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_atds_avro/utils/generator/generator_base.py b/tests/test_atds_avro/utils/generator/generator_base.py new file mode 100644 index 000000000..cdd038ec1 --- /dev/null +++ b/tests/test_atds_avro/utils/generator/generator_base.py @@ -0,0 +1,151 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Generator""" + +import abc +import hashlib +import tensorflow as tf + +from tests.test_atds_avro.utils.hash_util import int_to_bytes + + +class Generator(metaclass=abc.ABCMeta): + """Base class for tensor generator. + + Generator generates tf.Tensor, tf.sparse.SparseTensor, or + tf.RaggedTensor based on the input spec. The spec is one of the + tf.TensorSpec, tf.SparseTensorSpec or tf.RaggedTensorSpec. + """ + + _SUPPORTED_SPEC = (tf.TensorSpec, tf.SparseTensorSpec, tf.RaggedTensorSpec) + + # The value used in hashing None dimension. 
+ _NONE_HASH_VALUE = "None" + _SHAPE_HASH_VALUE = "shape" + _DTYPE_HASH_VALUE = "dtype" + + def __init__(self, spec): + """Create a new Generator. + + This must be called by the constructors of subclasses. + + Args: + spec: A tf.TensorSpec, tf.SparseTensorSpec or tf.RaggedTensorSpec that + describes the tensor generated by this generator. + + Raises: + TypeError: If spec is not one of the tf.TensorSpec, tf.SparseTensorSpec + or tf.RaggedTensorSpec. + """ + if not isinstance(spec, Generator._SUPPORTED_SPEC): + raise TypeError( + "Generator can only accept spec in one of the " + f"{Generator._SUPPORTED_SPEC} types but found {spec}." + ) + self._spec = spec + + @property + def spec(self): + """Return the spec that the generator will follow. + + The spec is one of the tf.TensorSpec, tf.SparseTensorSpec, and + tf.RaggedTensorSpec. + """ + return self._spec + + @abc.abstractmethod + def generate(self): + """Generate tensor value based on the spec of the generator. + + The output tensor must be compatible with the spec. + + When the spec is tf.TensorSpec, tf.SparseTensorSpec, or + tf.RaggedTensorSpec, this function will return tf.Tensor, + tf.sparse.SparseTensor, or tf.RaggedTensor respectively. + + Returns: + tf.Tensor, tf.sparse.SparseTensor, or tf.RaggedTensor + + Raises: + NotImplementedError: If subclass does not overload the function. + """ + raise NotImplementedError + + def get_generator_cls(self): + """Get the generator class which this object represents.""" + return type(self) + + def hash_code(self): + """Return the hasehd code of this Generator in hex str. + + The hashed code helps create the data source cache path on disk. + + Returns: + A hashed hex str of this Generator. 
+ """ + m = hashlib.sha256() + + # Step 1: hash class name + classname = self.get_generator_cls().__name__ + m.update(classname.encode()) + + # Step 2: hash input spec using default hash function in tf.TypeSpec + if isinstance(self.spec, tf.TensorSpec): + m.update(tf.TensorSpec.__name__.encode()) + elif isinstance(self.spec, tf.SparseTensorSpec): + m.update(tf.SparseTensorSpec.__name__.encode()) + elif isinstance(self.spec, tf.RaggedTensorSpec): + m.update(tf.RaggedTensorSpec.__name__.encode()) + m.update(b"ragged_rank") + m.update(int_to_bytes(self.spec.ragged_rank)) + m.update(b"row_splits_dtype") + m.update(int_to_bytes(hash(self.spec.row_splits_dtype))) + m.update(b"flat_values_spec") + if self.spec.flat_values_spec: + m.update(int_to_bytes(hash(self.spec.flat_values_spec))) + else: + raise TypeError( + f"Generator.spec is not one of {Generator._SUPPORTED_SPEC}." + ) + + self._hash_shape(m, self.spec.shape) + self._hash_dtype(m, self.spec.dtype) + + # Step 3" return hashed hex str. + return m.hexdigest() + + def _hash_shape(self, m, shape): + """Hash shape info with a hash algorithm in hashlib. + + Args: + m: A hashlib algorithm e.g. hashlib.sha256(). + shape: A list of python int. + """ + m.update(Generator._SHAPE_HASH_VALUE.encode()) + for dim in shape: + if dim is None: + m.update(Generator._NONE_HASH_VALUE.encode()) + else: + m.update(int_to_bytes(dim)) + + def _hash_dtype(self, m, dtype): + """Hash TensorFlow dtype with a hash algorithm in hashlib. + + Args: + m: A hashlib algorithm e.g. hashlib.sha256(). + dtype: TensorFlow dtype e.g. tf.float32. 
+ """ + m.update(Generator._DTYPE_HASH_VALUE.encode()) + m.update(int_to_bytes(hash(dtype))) diff --git a/tests/test_atds_avro/utils/generator/sparse_tensor_generator.py b/tests/test_atds_avro/utils/generator/sparse_tensor_generator.py new file mode 100644 index 000000000..d693f476c --- /dev/null +++ b/tests/test_atds_avro/utils/generator/sparse_tensor_generator.py @@ -0,0 +1,292 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""SparseTensorGenerator""" + +import enum +import random +import hashlib +import numpy as np +import tensorflow as tf + +from tests.test_atds_avro.utils.generator.generator_base import ( + Generator, +) +from tests.test_atds_avro.utils.hash_util import int_to_bytes + + +class ValueDistribution(enum.Enum): + SINGLE_VALUE = 1 + SMALL_NUM_VALUE = 2 # 5 to 9 elements + LARGE_NUM_VALUE = 3 # 100 to 999 elements + + +_VALUE_DISTRIBUTION_TO_RANGE = { + ValueDistribution.SINGLE_VALUE: (1, 2), + ValueDistribution.SMALL_NUM_VALUE: (5, 10), + ValueDistribution.LARGE_NUM_VALUE: (100, 1000), +} + + +def get_common_value_dist(): + # Assume tensor is one-hot since this is a common use case + return ValueDistribution.SINGLE_VALUE + + +class SparseTensorGeneratorBase(Generator): + """Base of SparseTensorGenerator that generates tf.sparse.SparseTensor.""" + + def __init__(self, spec, num_values): + """Create a new SparseTensorGeneratorBase. + + This must be called by the constructors of subclasses e.g. + IntSparseTensorGenerator, FloatSparseTensorGenerator, etc. + + Args: + spec: A tf.SparseTensorSpec that describes the output tensor. + num_values: A value distribution or an int specifying number of non-zero values in the sparse tensor. + + Raises: + TypeError: If spec is not tf.SparseTensorSpec. 
+ """ + super().__init__(spec) + + if not isinstance(spec, tf.SparseTensorSpec): + raise TypeError( + "Input spec must be a tf.SparseTensorSpec in SparseTensorGenerator " + f"but found {spec}" + ) + + if not isinstance(num_values, (int, ValueDistribution)): + raise TypeError( + f"num_values must be an int or ValueDistribution but found {num_values}" + ) + + self._value_dist = None + if isinstance(num_values, int): + self._num_values = num_values + elif num_values == ValueDistribution.SINGLE_VALUE: + self._num_values = 1 + else: + self._value_dist = num_values + + def _get_num_values(self, shape): + ret = 0 + if self._value_dist == None: + ret = self._num_values + else: + ret = np.random.randint(*_VALUE_DISTRIBUTION_TO_RANGE[self._value_dist]) + return min(ret, np.prod(shape)) + + def _get_shape(self): + # If spec shape is None, generate shape with random rank between 1 and 5 + shape = ( + [None] * np.random.randint(1, 5) + if self.spec.shape == None + else self.spec.shape + ) + # Populate unknown dimensions with random int between 1 and 10 + return [dim if dim != None else np.random.randint(1, 10) for dim in shape] + + def _generate_random_coords(self, num_values, shape): + if num_values == 0: + return np.empty((0, len(shape)), dtype=np.int64) + indices = sorted(random.sample(range(np.prod(shape)), num_values)) + return [self._int_to_coord(idx, shape) for idx in indices] + + def _int_to_coord(self, idx, shape): + """Convert an integer to its corresponding location in a tensor, in row-major order. + For example, in the 2d tensor + [[0, 1, 2] + [3, 4, 5]] + The index 3 will return [1, 0] (i.e. 
the entry in the second row, first column) + """ + rank = len(shape) + ret = [0] * rank + for dim in range(rank): + val = idx % (shape[rank - dim - 1]) + ret[rank - dim - 1] = val + idx = (idx - val) // shape[rank - dim - 1] + return ret + + def hash_code(self): + hash_code = super().hash_code() + + m = hashlib.sha256() + m.update(hash_code.encode()) + + # Hash input num_values + if self._value_dist: + m.update(ValueDistribution.__name__.encode()) + m.update(self._value_dist.name.encode()) # num_values is enum + else: + m.update(int_to_bytes(self._num_values)) # num_values is constant int. + return m.hexdigest() + + +class IntSparseTensorGenerator(SparseTensorGeneratorBase): + """IntSparseTensorGenerator generates tf.sparse.SparseTensor with dtype in tf.int32 or tf.int64""" + + def __init__(self, spec, num_values): + """Create a new IntSparseTensorGenerator + + With tf.int32 dtype, the generated int range is between -2^31 to 2^31 - 1. + With tf.int64 dtype, the generated int range is between -2^63 to 2^63 - 1. + + Args: + spec: A tf.SparseTensorSpec that describes the output tensor. + num_values: A value distribution or an int specifying number of non-zero values in the sparse tensor. + + Raises: + TypeError: If dtype in spec is not tf.int32 or tf.int64. + """ + super().__init__(spec, num_values) + + if spec.dtype not in [tf.int32, tf.int64]: + raise TypeError( + "IntSparseTensorGenerator can only generate tf.sparse.SparseTensor with " + f"dtype in tf.int32 or tf.int64 but found {spec.dtype}." 
+ ) + + def generate(self): + dtype = self.spec.dtype + info = np.iinfo(dtype.as_numpy_dtype) + shape = self._get_shape() + num_values = self._get_num_values(shape) + vals = np.random.randint( + low=info.min, high=info.max, size=[num_values], dtype=dtype.as_numpy_dtype + ) + coords = self._generate_random_coords(num_values, shape) + return tf.SparseTensor(indices=coords, values=vals, dense_shape=shape) + + +class FloatSparseTensorGenerator(SparseTensorGeneratorBase): + """FloatSparseTensorGenerator generates tf.sparse.SparseTensor with dtype in tf.float32 + or tf.float64.""" + + def __init__(self, spec, num_values): + """Create a new FloatSparseTensorGenerator + + The generated float range is between 0.0 to 1.0. + + Args: + spec: A tf.SparseTensorSpec that describes the output tensor. + num_values: A value distribution or an int specifying number of non-zero values in the sparse tensor. + + Raises: + TypeError: If dtype in spec is not tf.float32 or tf.float64. + """ + super().__init__(spec, num_values) + + if spec.dtype not in [tf.float32, tf.float64]: + raise TypeError( + "FloatSparseTensorGenerator can only generate tf.sparse.SparseTensor with " + f"dtype in tf.float32 or tf.float64 but found {spec.dtype}." + ) + + def generate(self): + shape = self._get_shape() + num_values = self._get_num_values(shape) + vals = np.random.rand(num_values) + if self.spec.dtype == tf.float32: + vals = vals.astype(np.float32) + coords = self._generate_random_coords(num_values, shape) + return tf.SparseTensor(indices=coords, values=vals, dense_shape=shape) + + +class WordSparseTensorGenerator(SparseTensorGeneratorBase): + """WordSparseTensorGenerator generates string tf.SparseTensor with string + length similar to a word.""" + + def __init__(self, spec, num_values, avg_length=5): + """Create a new WordSparseTensorGenerator + + WordSparseTensorGenerator samples word length using Poisson distribution + with lambda equals to avg_length and generates random bytes for each word. 
+ + Args: + spec: A tf.SparseTensorSpec that describes the output tensor. + num_values: A value distribution or an int specifying number of non-zero values in the sparse tensor. + avg_length: An int that represents the average word length. + + Raises: + TypeError: If dtype in spec is not tf.string. + ValueError: If avg_length is not positive. + """ + super().__init__(spec, num_values) + + if spec.dtype is not tf.string: + raise TypeError( + "WordSparseTensorGenerator can only generate tf.sparse.SparseTensor with " + f"dtype in tf.string but found {spec.dtype}." + ) + + if avg_length < 1: + raise ValueError( + "WordSparseTensorGenerator must have positive avg_length" + f" but found {avg_length}." + ) + self._avg_length = avg_length + + def generate(self): + # Use Poisson distribution to sample the length of byte strings. + # The avg_length equals to the lambda in Poisson distribution. + shape = self._get_shape() + num_values = self._get_num_values(shape) + lengths = np.random.poisson(self._avg_length, size=num_values) + + to_string = lambda length: np.random.bytes(length) + vfunc = np.vectorize(to_string) + vals = vfunc(lengths) + coords = self._generate_random_coords(num_values, shape) + return tf.SparseTensor(indices=coords, values=vals, dense_shape=shape) + + def hash_code(self): + hash_code = super().hash_code() + m = hashlib.sha256() + m.update(hash_code.encode()) + m.update(int_to_bytes(self._avg_length)) + return m.hexdigest() + + +class BoolSparseTensorGenerator(SparseTensorGeneratorBase): + """BoolSparseTensorGenerator generates tf.sparse.SparseTensor with dtype in tf.bool.""" + + def __init__(self, spec, num_values): + """Create a new BoolSparseTensorGenerator. + + The generated bool value has equal true and false possibility. + + Args: + spec: A tf.SparseTensorSpec that describes the output tensor. + num_values: A value distribution or an int specifying number of non-zero values in the sparse tensor. 
+ + Raises: + TypeError: If dtype in spec is not tf.bool. + """ + super().__init__(spec, num_values) + + if spec.dtype is not tf.bool: + raise TypeError( + "BoolSparseTensorGenerator can only generate tf.sparse.SparseTensor with " + f"dtype in tf.bool but found {spec.dtype}." + ) + + def generate(self): + shape = self._get_shape() + # np.random.rand generates values from 0 to 1 using Uniform distribution + num_values = self._get_num_values(shape) + vals = np.random.rand(num_values) > 0.5 + coords = self._generate_random_coords(num_values, shape) + return tf.SparseTensor(indices=coords, values=vals, dense_shape=shape) diff --git a/tests/test_atds_avro/utils/generator/sparse_util.py b/tests/test_atds_avro/utils/generator/sparse_util.py new file mode 100644 index 000000000..dc41947cb --- /dev/null +++ b/tests/test_atds_avro/utils/generator/sparse_util.py @@ -0,0 +1,28 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + + +def coord_to_int(coord, shape): + """Convert a location in a tensor to its unique index, in row-major order. + For example, in the 2d tensor + [[0, 1, 2] + [3, 4, 5]] + The location [1, 0] (i.e. the entry in the second row, first column) will return 3. 
+ """ + ret = 0 + rank = len(shape) + for dim in range(rank): + ret = ret * shape[dim] + coord[dim] + return ret diff --git a/tests/test_atds_avro/utils/generator/tensor_generator.py b/tests/test_atds_avro/utils/generator/tensor_generator.py new file mode 100644 index 000000000..833f75efe --- /dev/null +++ b/tests/test_atds_avro/utils/generator/tensor_generator.py @@ -0,0 +1,197 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""TensorGenerator""" + +import hashlib +import numpy as np +import tensorflow as tf + +from tests.test_atds_avro.utils.generator.generator_base import ( + Generator, +) +from tests.test_atds_avro.utils.hash_util import int_to_bytes + + +class TensorGeneratorBase(Generator): + """Base of TensorGenerator that generates tf.Tensor.""" + + def __init__(self, spec): + """Create a new TensorGeneratorBase. + + This must be called by the constructors of subclasses e.g. + IntTensorGenerator, FloatTensorGenerator, etc. + + Args: + spec: A tf.TensorSpec that describes the output tensor. + + Raises: + TypeError: If spec is not tf.TensorSpec. + ValueError: If shape in spec is not fully defined. 
+ """ + super().__init__(spec) + + if not isinstance(spec, tf.TensorSpec): + raise TypeError( + "Input spec must be a tf.TensorSpec in TensorGenerator " + f"but found {spec}" + ) + + spec.shape.assert_is_fully_defined() + + +class IntTensorGenerator(TensorGeneratorBase): + """IntTensorGenerator generates tf.Tensor with dtype in tf.int32 or tf.int64""" + + def __init__(self, spec): + """Create a new IntTensorGenerator + + With tf.int32 dtype, the generated int range is between -2^31 to 2^31 - 1. + With tf.int64 dtype, the generated int range is between -2^63 to 2^63 - 1. + + Args: + spec: A tf.TensorSpec that describes the output tensor. + + Raises: + TypeError: If dtype in spec is not tf.int32 or tf.int64. + """ + super().__init__(spec) + + if spec.dtype not in [tf.int32, tf.int64]: + raise TypeError( + "IntTensorGenerator can only generate tf.Tensor with " + f"dtype in tf.int32 or tf.int64 but found {spec.dtype}." + ) + + def generate(self): + dtype = self.spec.dtype + info = np.iinfo(dtype.as_numpy_dtype) + shape = self.spec.shape.as_list() + values = np.random.randint( + low=info.min, high=info.max, size=shape, dtype=dtype.as_numpy_dtype + ) + return tf.convert_to_tensor(values, dtype=dtype, name=self.spec.name) + + +class FloatTensorGenerator(TensorGeneratorBase): + """FloatTensorGenerator generates tf.Tensor with dtype in tf.float32 + or tf.float64.""" + + def __init__(self, spec): + """Create a new FloatTensorGenerator + + The generated float range is between 0.0 to 1.0. + + Args: + spec: A tf.TensorSpec that describes the output tensor. + + Raises: + TypeError: If dtype in spec is not tf.float32 or tf.float64. + """ + super().__init__(spec) + + if spec.dtype not in [tf.float32, tf.float64]: + raise TypeError( + "FloatTensorGenerator can only generate tf.Tensor with " + f"dtype in tf.float32 or tf.float64 but found {spec.dtype}." 
+ ) + + def generate(self): + shape = self.spec.shape.as_list() + values = np.random.rand(*shape) + return tf.convert_to_tensor(values, dtype=self.spec.dtype, name=self.spec.name) + + +class WordTensorGenerator(TensorGeneratorBase): + """WordTensorGenerator generates string tf.Tensor with string + length similar to a word.""" + + def __init__(self, spec, avg_length=5): + """Create a new WordTensorGenerator + + WordTensorGenerator samples word length using Poisson distribution + with lambda equals to avg_length and generates random bytes for each word. + + Args: + spec: A tf.TensorSpec that describes the output tensor. + avg_length: An int that represents the average word length. + + Raises: + TypeError: If dtype in spec is not tf.string. + ValueError: If avg_length is not positive. + """ + super().__init__(spec) + + if spec.dtype is not tf.string: + raise TypeError( + "WordTensorGenerator can only generate tf.Tensor with " + f"dtype in tf.string but found {spec.dtype}." + ) + + if avg_length < 1: + raise ValueError( + "WordTensorGenerator must have positive avg_length" + f" but found {avg_length}." + ) + self._avg_length = avg_length + + def generate(self): + # Use Poisson distribution to sample the length of byte strings. + # The avg_length equals to the lambda in Poisson distribution. 
+ shape = self.spec.shape.as_list() + lengths = np.random.poisson(self._avg_length, size=shape) + + to_string = lambda length: np.random.bytes(length) + vfunc = np.vectorize(to_string) + values = vfunc(lengths) + + return tf.convert_to_tensor(values, dtype=tf.string, name=self.spec.name) + + def hash_code(self): + """Return the hashed code of this Generator in hex str.""" + hash_code = super().hash_code() + + m = hashlib.sha256() + m.update(hash_code.encode()) + m.update(int_to_bytes(self._avg_length)) + return m.hexdigest() + + +class BoolTensorGenerator(TensorGeneratorBase): + """BoolTensorGenerator generates tf.Tensor with dtype in tf.bool.""" + + def __init__(self, spec): + """Create a new BoolTensorGenerator. + + The generated bool value has equal true and false possibility. + + Args: + spec: A tf.TensorSpec that describes the output tensor. + + Raises: + TypeError: If dtype in spec is not tf.bool. + """ + super().__init__(spec) + + if spec.dtype is not tf.bool: + raise TypeError( + "BoolTensorGenerator can only generate tf.Tensor with " + f"dtype in tf.bool but found {spec.dtype}." + ) + + def generate(self): + shape = self.spec.shape.as_list() + # np.random.rand generates values from 0 to 1 using Uniform distribution + values = np.random.rand(*shape) > 0.5 + return tf.convert_to_tensor(values, dtype=tf.bool, name=self.spec.name) diff --git a/tests/test_atds_avro/utils/generator/varlen_tensor_generator.py b/tests/test_atds_avro/utils/generator/varlen_tensor_generator.py new file mode 100644 index 000000000..4dbe6293b --- /dev/null +++ b/tests/test_atds_avro/utils/generator/varlen_tensor_generator.py @@ -0,0 +1,263 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""VarLenTensorGenerator""" + +import enum +import hashlib +import numpy as np +import tensorflow as tf + +from tests.test_atds_avro.utils.generator.generator_base import ( + Generator, +) +from tests.test_atds_avro.utils.hash_util import int_to_bytes + + +class DimensionDistribution(enum.Enum): + ONE_DIM = 1 # dimension size 1 + TWO_DIM = 2 # dimension size 2 + LARGE_DIM = 3 # dimension size from 5 to 10 + + +DIM_DISTRIBUTION_TO_RANGE = {DimensionDistribution.LARGE_DIM: (5, 10)} + + +class VarLenTensorGeneratorBase(Generator): + """Base of VarLenTensorGeneratorBase that generates tf.sparse.SparseTensor.""" + + def __init__(self, spec, dim_dist): + """Create a new VarLenTensorGeneratorBase. + + This must be called by the constructors of subclasses e.g. + IntVarLenTensorGeneratorBase, FloatVarLenTensorGeneratorBase, etc. + + Args: + spec: A tf.SparseTensorSpec that describes the output tensor. + dim_dist: Distribution of dimension sizes. + + Raises: + TypeError: If spec is not tf.SparseTensorSpec. 
+ """ + super().__init__(spec) + if not isinstance(spec, tf.SparseTensorSpec): + raise TypeError( + "Input spec must be a tf.SparseTensorSpec in VarLenTensorGenerator " + f"but found {spec}" + ) + + if not isinstance(dim_dist, DimensionDistribution): + raise TypeError( + f"dim_dist must be a DimensionDistribution but found {dim_dist}" + ) + + if self.spec.shape.rank is None: + raise ValueError(f"Input spec must have known rank") + + self._dim_dist = dim_dist + + def _get_dim(self): + if self._dim_dist == DimensionDistribution.ONE_DIM: + return 1 + elif self._dim_dist == DimensionDistribution.TWO_DIM: + return 2 + elif self._dim_dist == DimensionDistribution.LARGE_DIM: + return np.random.randint(*DIM_DISTRIBUTION_TO_RANGE[self._dim_dist]) + else: + raise ValueError( + f"Found unsupported dimension distribution {self._dim_dist}" + ) + + def _get_shape(self): + return [ + dim if dim is not None else self._get_dim() + for dim in self.spec.shape.as_list() + ] + + def _get_idx(self, depth, shape, current_idx, ret): + cur_dim = shape[depth] + # Generate full list of idx, e.g. for a [2, 3] tensor: + # [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]] + for i in range(cur_dim): + current_idx[depth] = i + if depth == len(shape) - 1: + ret.append(current_idx.copy()) + else: + self._get_idx(depth + 1, shape, current_idx, ret) + + def hash_code(self): + hash_code = super().hash_code() + + m = hashlib.sha256() + m.update(hash_code.encode()) + m.update(DimensionDistribution.__name__.encode()) + m.update(self._dim_dist.name.encode()) + return m.hexdigest() + + +class IntVarLenTensorGenerator(VarLenTensorGeneratorBase): + """IntVarLenTensorGenerator generates tf.sparse.SparseTensor with dtype in tf.int32 or tf.int64""" + + def __init__(self, spec, dim_dist=DimensionDistribution.ONE_DIM): + """Create a new IntVarLenTensorGenerator + + With tf.int32 dtype, the generated int range is between -2^31 to 2^31 - 1. + With tf.int64 dtype, the generated int range is between -2^63 to 2^63 - 1. 
+ + Args: + spec: A tf.SparseTensorSpec that describes the output tensor. + dim_dist: Distribution of dimension sizes. + + Raises: + TypeError: If dtype in spec is not tf.int32 or tf.int64. + """ + super().__init__(spec, dim_dist) + + if spec.dtype not in [tf.int32, tf.int64]: + raise TypeError( + "IntVarLenTensorGenerator can only generate tf.sparse.SparseTensor with " + f"dtype in tf.int32 or tf.int64 but found {spec.dtype}." + ) + + def generate(self): + dtype = self.spec.dtype + info = np.iinfo(dtype.as_numpy_dtype) + shape = self._get_shape() + idxs = [] + self._get_idx(0, shape, [0] * len(shape), idxs) + vals = np.random.randint( + low=info.min, high=info.max, size=len(idxs), dtype=dtype.as_numpy_dtype + ) + return tf.SparseTensor(indices=idxs, values=vals, dense_shape=shape) + + +class FloatVarLenTensorGenerator(VarLenTensorGeneratorBase): + """FloatVarLenTensorGenerator generates tf.sparse.SparseTensor with dtype in tf.float32 + or tf.float64.""" + + def __init__(self, spec, dim_dist=DimensionDistribution.ONE_DIM): + """Create a new FloatVarLenTensorGenerator + + The generated float range is between 0.0 to 1.0. + + Args: + spec: A tf.SparseTensorSpec that describes the output tensor. + dim_dist: Distribution of dimension sizes. + + Raises: + TypeError: If dtype in spec is not tf.float32 or tf.float64. + """ + super().__init__(spec, dim_dist) + + if spec.dtype not in [tf.float32, tf.float64]: + raise TypeError( + "FloatVarLenTensorGenerator can only generate tf.sparse.SparseTensor with " + f"dtype in tf.float32 or tf.float64 but found {spec.dtype}." 
+ ) + + def generate(self): + shape = self._get_shape() + idxs = [] + self._get_idx(0, shape, [0] * len(shape), idxs) + vals = np.random.rand(len(idxs)) + if self.spec.dtype == tf.float32: + vals = vals.astype(np.float32) + return tf.SparseTensor(indices=idxs, values=vals, dense_shape=shape) + + +class WordVarLenTensorGenerator(VarLenTensorGeneratorBase): + """WordVarLenTensorGenerator generates string tf.SparseTensor with string + length similar to a word.""" + + def __init__(self, spec, dim_dist=DimensionDistribution.ONE_DIM, avg_length=5): + """Create a new WordVarLenTensorGenerator + + WordVarLenTensorGenerator samples word length using Poisson distribution + with lambda equals to avg_length and generates random bytes for each word. + + Args: + spec: A tf.SparseTensorSpec that describes the output tensor. + dim_dist: Distribution of dimension sizes. + avg_length: An int that represents the average word length. + + Raises: + TypeError: If dtype in spec is not tf.string. + ValueError: If avg_length is not positive. + """ + super().__init__(spec, dim_dist) + + if spec.dtype is not tf.string: + raise TypeError( + "WordVarLenTensorGenerator can only generate tf.sparse.SparseTensor with " + f"dtype in tf.string but found {spec.dtype}." + ) + + if avg_length < 1: + raise ValueError( + "WordVarLenTensorGenerator must have positive avg_length" + f" but found {avg_length}." + ) + self._avg_length = avg_length + + def generate(self): + # Use Poisson distribution to sample the length of byte strings. + # The avg_length equals to the lambda in Poisson distribution. 
+ shape = self._get_shape() + idxs = [] + self._get_idx(0, shape, [0] * len(shape), idxs) + lengths = np.random.poisson(self._avg_length, size=len(idxs)) + + to_string = lambda length: np.random.bytes(length) + vfunc = np.vectorize(to_string) + vals = vfunc(lengths) + return tf.SparseTensor(indices=idxs, values=vals, dense_shape=shape) + + def hash_code(self): + hash_code = super().hash_code() + m = hashlib.sha256() + m.update(hash_code.encode()) + m.update(int_to_bytes(self._avg_length)) + return m.hexdigest() + + +class BoolVarLenTensorGenerator(VarLenTensorGeneratorBase): + """BoolVarLenTensorGenerator generates tf.sparse.SparseTensor with dtype in tf.bool.""" + + def __init__(self, spec, dim_dist=DimensionDistribution.ONE_DIM): + """Create a new BoolVarLenTensorGenerator. + + The generated bool value has equal true and false possibility. + + Args: + spec: A tf.SparseTensorSpec that describes the output tensor.\ + dim_dist: Distribution of dimension sizes. + + Raises: + TypeError: If dtype in spec is not tf.bool. + """ + super().__init__(spec, dim_dist) + + if spec.dtype is not tf.bool: + raise TypeError( + "BoolVarLenTensorGenerator can only generate tf.sparse.SparseTensor with " + f"dtype in tf.bool but found {spec.dtype}." + ) + + def generate(self): + shape = self._get_shape() + # np.random.rand generates values from 0 to 1 using Uniform distribution + idxs = [] + self._get_idx(0, shape, [0] * len(shape), idxs) + vals = np.random.rand(len(idxs)) > 0.5 + return tf.SparseTensor(indices=idxs, values=vals, dense_shape=shape) diff --git a/tests/test_atds_avro/utils/hash_util.py b/tests/test_atds_avro/utils/hash_util.py new file mode 100644 index 000000000..21ec4742f --- /dev/null +++ b/tests/test_atds_avro/utils/hash_util.py @@ -0,0 +1,22 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utility functions for hashing""" + + +def int_to_bytes(x): + # Add one bit because a signed N-bit int can only represent up to 2^(N-1) - 1 + # (instead of an unsigned N-bit int which can represent up to 2^N - 1). + # For example, 128 requires 9 bits (therefore two bytes) in twos complement. + return x.to_bytes(x.bit_length() // 8 + 1, byteorder="little", signed=True) diff --git a/tests/test_atds_avro/utils/stat_t_test_cli.py b/tests/test_atds_avro/utils/stat_t_test_cli.py new file mode 100644 index 000000000..be5c1f976 --- /dev/null +++ b/tests/test_atds_avro/utils/stat_t_test_cli.py @@ -0,0 +1,352 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"Command line tool to analyze benchmark result with Welch's t-test." 
+ +import argparse +import collections +import logging +import json +import os +from operator import attrgetter + +import numpy as np +from scipy.stats import t + +# Common field names used in pytest-benchmark JSON report. +BENCHMARKS = "benchmarks" +GROUP = "group" +NAME = "name" +STATS = "stats" +MEAN = "mean" +STDDEV = "stddev" +ROUNDS = "rounds" + + +class BenchmarkName(collections.namedtuple("BenchmarkName", ["group", "name"])): + """The name and group of a pytest-benchmark test. + + Fields: + group: The name of the pytest-benchmark group that this test belongs to. + A benchmark group contains one or more benchmark tests. + name: The name of the benchmark test. + """ + + pass + + +class BenchmarkResult(collections.namedtuple("BenchmarkResult", ["label", "result"])): + """Benchmark result loaded from pytest-benchmark JSON report. + + Fields: + label: The label of the benchmark run. + result: A benchmark dict generated by the pytest-benchmark JSON report. + Key is a BenchmarkName. Value is a stat dict. Stat dict contains + statistics such as min, max, mean, std, etc + """ + + pass + + +class TTestResult( + collections.namedtuple( + "TTestResult", ["p_value", "t_stat", "lower_bound", "upper_bound", "mean_delta"] + ) +): + """Welch's t-test result and one sided confidence interval. + + Fields: + p_value: The p value of the Welch's t-test. + t_stat: The T statistic of the Welch's t-test. + lower_bound: The lower bound of the one sided confidence interval. + upper_bound: The upper bound of the one sided confidence interval. + mean_delta: The difference in sample mean. 
+ """ + + pass + + +class AlphaAction(argparse.Action): + """Parser action for confidence level alpha to validate its value.""" + + def __call__(self, parser, namespace, values, option_string=None): + if values <= 0.0 or values >= 1.0: + parser.error(f"The alpha range of '{option_string}' should be (0.0, 1.0).") + + setattr(namespace, self.dest, values) + + +def load_benchmark_result(path): + filename = os.path.basename(path) + label = os.path.splitext(filename)[0] # Remove file extension + + with open(path) as f: + report = json.load(f) + + result = {} + for benchmark_test in report[BENCHMARKS]: + group_name = benchmark_test[GROUP] + test_name = benchmark_test[NAME] + + benchmark_name = BenchmarkName(group=group_name, name=test_name) + result[benchmark_name] = benchmark_test[STATS] + + return BenchmarkResult(label=label, result=result) + + +def run_welchs_ttest(stat1, stat2, alpha, faster): + """Run one tailed Welch's t-test to verify if stat1 is faster/slower than stat2 + + Please refer wiki for more details about Welch's t-test. + https://en.wikipedia.org/wiki/Welch%27s_t-test + + Please check scipy for the t-test implementation details. + https://github.com/scipy/scipy/blob/v1.7.1/scipy/stats/stats.py#L5712-L5833 + + The confidence interval is computed with one sided approach. For more details, see + https://stats.stackexchange.com/questions/257526/can-one-sided-confidence-intervals-have-95-coverage + + Args: + stat1: The first statistic dict collected by pytest-benchmark. + stat2: The second statistic dict collected by pytest-benchmark. + alpha: The confidence level. + faster: True to test if stat1 is faster than stat2. False to test + if stat1 is slower than stat2. 
+ + Returns: + A TTestResult + """ + m1 = stat1[MEAN] + m2 = stat2[MEAN] + + s1 = stat1[STDDEV] + s2 = stat2[STDDEV] + + n1 = stat1[ROUNDS] + n2 = stat2[ROUNDS] + + df1 = n1 - 1 # degree of freedom of stat1 + df2 = n2 - 1 # degree of freedom of stat2 + + sample_v1 = s1 ** 2 / n1 # biased estimated sample variance of stat1 + sample_v2 = s2 ** 2 / n2 # biased estimated sample variance of stat2 + + biased_variance = np.sqrt(sample_v1 + sample_v2) + # degree of freedom + df = (sample_v1 + sample_v2) ** 2 / ( + sample_v1 ** 2 / (df1) + sample_v2 ** 2 / (df2) + ) + + mean_delta = m1 - m2 + t_stat = mean_delta / biased_variance + + if faster: + # Null hypothesis is stat1 >= stat2. + # Alternative hypothesis is stat1 < stat2. + p_value = t.cdf(t_stat, df) + + # Compute one sided confidence interval (-inf, x) + upper_bound = mean_delta + t.ppf(1.0 - alpha, df) * biased_variance + upper_bound = format(upper_bound, ".5f") + lower_bound = "-inf" + else: + # Null hypothesis is stat1 <= stat2. + # Alternative hypothesis is stat1 > stat2. 
+ p_value = 1.0 - t.cdf(t_stat, df) + + # Compute one sided confidence interval (x, inf) + upper_bound = "inf" + lower_bound = mean_delta + t.ppf(alpha, df) * biased_variance + lower_bound = format(lower_bound, ".5f") + + return TTestResult( + p_value=p_value, + t_stat=t_stat, + lower_bound=lower_bound, + upper_bound=upper_bound, + mean_delta=format(mean_delta, ".5f"), + ) + + +def create_result_messages(benchmark_results, alpha): + sorted_benchmarks = sorted( + list(benchmark_results.keys()), key=attrgetter("group", "name") + ) + + template = "\t{benchmark:55}{mean_delta:25}{confidence_interval:40}" + + confidence = (1.0 - alpha) * 100 + results = [ + template.format( + benchmark="Benchmark test (group::name)", + mean_delta="Mean delta in second", + confidence_interval=f"{confidence}% confidence interval of mean delta", + ) + ] + for benchmark in sorted_benchmarks: + ttest_result = benchmark_results[benchmark] + results.append( + template.format( + benchmark=f"{benchmark.group}::{benchmark.name}", + mean_delta=f"{ttest_result.mean_delta}", + confidence_interval=f"({ttest_result.lower_bound}, " + f"{ttest_result.upper_bound})", + ) + ) + + return "\n".join(results) + + +def log_benchmark_not_in_both_report(diff, in_result, not_in_result): + sorted_diff = sorted(list(diff), key=attrgetter("group", "name")) + sorted_benchmarks = "\n".join( + [f"\t{benchmark.group}::{benchmark.name}" for benchmark in sorted_diff] + ) + + message = ( + f"Found following benchmarks in {in_result.label} " + f"but not in {not_in_result.label}.\n {sorted_benchmarks}" + ) + logging.warning(message) + + +def log_no_overlapped_benchmark_result(first_result, second_result): + message = ( + f"Benchmark results in {first_result.label} " + f"and {second_result.label} have no intersection." 
+ ) + logging.warning(message) + + +def log_not_significant_benchmark_result(benchmarks, label1, label2, order, alpha): + result_messages = create_result_messages(benchmarks, alpha) + message = ( + f"The following benchmark results does NOT show that " + f"{label1} is statistically significant {order} than " + f"{label2}.\n {result_messages}" + ) + logging.info(message) + + +def log_significant_benchmark_result(benchmarks, label1, label2, order, alpha): + result_messages = create_result_messages(benchmarks, alpha) + message = ( + f"The following benchmark results show that {label1} is " + f"statistically significant {order} than {label2}." + f"\n{result_messages}" + ) + logging.info(message) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Run Welch's t-test on benchmark result." + ) + parser.add_argument( + "first_report", + metavar="r1", + type=str, + help="Path to the first pytest-benchmark JSON report.", + ) + parser.add_argument( + "second_report", + metavar="r2", + type=str, + help="Path to the second pytest-benchmark JSON report.", + ) + parser.add_argument( + "-a", + "--alpha", + metavar="alpha", + type=float, + default=0.05, + action=AlphaAction, + help="The confidence level in t-test.", + ) + + slower_message = ( + "Set this flag to test if the first result is slower " + "than the second. Otherwise, the tool will test if the " + "first result is faster than the second." 
+ ) + parser.add_argument("--slower", action="store_true", help=slower_message) + + return parser.parse_args() + + +def main(): + args = parse_args() + + first_result = load_benchmark_result(args.first_report) + second_result = load_benchmark_result(args.second_report) + + benchmark_in_first = set(first_result.result.keys()) + benchmark_in_second = set(second_result.result.keys()) + + first_but_not_second = benchmark_in_first.difference(benchmark_in_second) + second_but_not_first = benchmark_in_second.difference(benchmark_in_first) + in_both = benchmark_in_first.intersection(benchmark_in_second) + + if first_but_not_second: + log_benchmark_not_in_both_report( + diff=first_but_not_second, + in_result=first_result, + not_in_result=second_result, + ) + + if second_but_not_first: + log_benchmark_not_in_both_report( + diff=second_but_not_first, + in_result=second_result, + not_in_result=first_result, + ) + + if not in_both: + log_no_overlapped_benchmark_result(first_result, second_result) + return + + is_faster = False if args.slower else True + alpha = args.alpha + + significant = {} + not_significant = {} + for benchmark_test in in_both: + ttest_result = run_welchs_ttest( + stat1=first_result.result[benchmark_test], + stat2=second_result.result[benchmark_test], + alpha=alpha, + faster=is_faster, + ) + + if ttest_result.p_value < alpha: + significant[benchmark_test] = ttest_result + else: + not_significant[benchmark_test] = ttest_result + + order = "faster" if is_faster else "slower" + + if not_significant: + log_not_significant_benchmark_result( + not_significant, first_result.label, second_result.label, order, alpha + ) + + if significant: + log_significant_benchmark_result( + significant, first_result.label, second_result.label, order, alpha + ) + + +if __name__ == "__main__": + logging.getLogger().setLevel(logging.INFO) + main() diff --git a/tests/test_atds_avro/utils/tf_record_writer.py b/tests/test_atds_avro/utils/tf_record_writer.py new file mode 100644 index 
000000000..f830b0555 --- /dev/null +++ b/tests/test_atds_avro/utils/tf_record_writer.py @@ -0,0 +1,305 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""TFRecordWriter""" + +import os +from tests.test_atds_avro.utils.generator.tensor_generator import ( + TensorGeneratorBase, +) +from tests.test_atds_avro.utils.generator.sparse_tensor_generator import ( + SparseTensorGeneratorBase, +) +from tests.test_atds_avro.utils.generator.varlen_tensor_generator import ( + VarLenTensorGeneratorBase, +) +import numpy as np +import tensorflow as tf + +from tests.test_atds_avro.utils.file_writer import FileWriter + + +class TFRecordWriter(FileWriter): + """File writer for TFRecord dataset. + + TFRecordWriter serializes tensors in tf.Example schema and write them + into files in TFRecord format. The written file can be loaded with + tf.data.TFRecordDataset. + """ + + # The TFRecord file extension. + _TFRECORD_EXTENSION = "tfrecords" + + # TFRecord Dataset reads TFRecord data serialized in tf.Example schema. + # tf.Example only supported three dtypes i.e. int64, float32, and bytes. + # See https://www.tensorflow.org/tutorials/load_data/tfrecord#data_types_for_tftrainexample + # The lists below are used to map tensor dtype into supported dtype in + # tf.Example. For example, tf.bool will be mapped to int64. 
+ _INT64_LIST_DTYPE = [tf.bool, tf.int32, tf.int64, tf.uint32, tf.uint64] + _FLOAT_LIST_DTYPE = [tf.float32, tf.float64] + _BYTES_LIST_DTYPE = [tf.string] + + # Sparse tensor is composed of many 1D dense tensors in tf.Example. + # The suffix is used to name these dense tensors given the sparse + # tensor name. For example, a 2D sparse tensor 'feature' was represented + # with three dense tensors with name 'feature/indices0', 'feature/indices1', + # and 'feature/values'. + _SPARSE_INDICES_SUFFIX = "/indices" + _SPARSE_VALUES_SUFFIX = "/values" + + def __init__(self): + """Create a new TFRecordWriter""" + super().__init__() + + @property + def extension(self): + """Return the file extension of the written files.""" + return TFRecordWriter._TFRECORD_EXTENSION + + def _write_to_path(self, dir_path, data_source): + """Generate data based on the data_source and write + files under the given path.""" + scenario = data_source.scenario + + filenames_to_num_records = self._get_filenames_to_num_records(data_source) + for filename in sorted(filenames_to_num_records): + file_path = os.path.join(dir_path, filename) + num_records = filenames_to_num_records[filename] + with tf.io.TFRecordWriter(file_path) as file_writer: + for _ in range(num_records): + features = {} + + for feature_name in scenario: + generator = scenario[feature_name] + tensor = generator.generate() + self._add_tensor_to_features( + generator, feature_name, tensor, features + ) + record_bytes = tf.train.Example( + features=tf.train.Features(feature=features) + ).SerializeToString() + file_writer.write(record_bytes) + + def _write_to_path_from_cached_data(self, dir_path, data_source, dataset): + if os.path.exists(dir_path): + return dir_path + os.makedirs(dir_path) + + scenario = data_source.scenario + filenames_to_num_records = self._get_filenames_to_num_records(data_source) + iterator = iter(dataset) + for filename in sorted(filenames_to_num_records): + file_path = os.path.join(dir_path, filename) + num_records 
= filenames_to_num_records[filename] + + with tf.io.TFRecordWriter(file_path) as file_writer: + for _ in range(num_records): + features = {} + record = iterator.get_next() + + for feature_name in scenario: + generator = scenario[feature_name] + feature = record[feature_name] + self._add_tensor_to_features( + generator, feature_name, feature, features + ) + record_bytes = tf.train.Example( + features=tf.train.Features(feature=features) + ).SerializeToString() + file_writer.write(record_bytes) + + def _add_dense_tensor_to_features(self, name, value, dtype, features): + """Wrap value np.array into tf.train.Feature and add it into features.""" + if np.isscalar(value): + value = [value] # Convert scalar into a list. + else: + value = value.flatten() + + example_dtype = self._map_tensor_dtype_to_example_dtype(dtype) + if example_dtype is tf.int64: + features[name] = tf.train.Feature( + int64_list=tf.train.Int64List(value=value) + ) + elif example_dtype is tf.float32: + features[name] = tf.train.Feature( + float_list=tf.train.FloatList(value=value) + ) + elif example_dtype is tf.string: + features[name] = tf.train.Feature( + bytes_list=tf.train.BytesList(value=value) + ) + else: + raise TypeError(f"Dtype {dtype} is not supported in tf.Example.") + + def _map_tensor_dtype_to_example_dtype(self, dtype): + """As tf.Example only supports tf.float32, tf.int64, and tf.string dtype. 
+ This function maps tensor dtype into the dtype supported by tf.Example.""" + if dtype in TFRecordWriter._INT64_LIST_DTYPE: + return tf.int64 + elif dtype in TFRecordWriter._FLOAT_LIST_DTYPE: + return tf.float32 + elif dtype in TFRecordWriter._BYTES_LIST_DTYPE: + return tf.string + else: + raise TypeError(f"Dtype {dtype} is not supported in tf.Example.") + + def _add_sparse_tensor_to_features(self, name, tensor, features): + indices_name = name + TFRecordWriter._SPARSE_INDICES_SUFFIX + values_name = name + TFRecordWriter._SPARSE_VALUES_SUFFIX + + rank = len(tensor.shape.as_list()) + indices = tensor.indices.numpy() # indices tensor must be a 2D array + # Split indices array along the second dimension so that the split arrays + # contain the indices for separate dimension. For example, + # indices = [[0, 1], [2, 3], [4, 5]] => + # indices_at_dim0 = [[0], [2], [4]] and + # indices_at_dim1 = [[1], [3], [5]]. + split_indices = np.split(indices, rank, axis=1) + for dim in range(rank): + indices_name_at_dim = indices_name + str(dim) + self._add_dense_tensor_to_features( + name=indices_name_at_dim, + value=split_indices[dim], + dtype=tensor.indices.dtype, + features=features, + ) + + self._add_dense_tensor_to_features( + name=values_name, + value=tensor.values.numpy(), + dtype=tensor.values.dtype, + features=features, + ) + + def _add_tensor_to_features(self, generator, feature_name, tensor, features): + spec = generator.spec + if isinstance(spec, tf.TensorSpec): + self._add_dense_tensor_to_features( + feature_name, tensor.numpy(), tensor.dtype, features + ) + elif isinstance(spec, tf.SparseTensorSpec): + if ( + issubclass(generator.get_generator_cls(), VarLenTensorGeneratorBase) + and spec.shape.rank == 1 + ): + self._add_dense_tensor_to_features( + feature_name, + tf.sparse.to_dense(tensor).numpy(), + tensor.dtype, + features, + ) + elif ( + issubclass(generator.get_generator_cls(), SparseTensorGeneratorBase) + and spec.shape.is_fully_defined() + ): + 
self._add_sparse_tensor_to_features(feature_name, tensor, features) + else: + raise ValueError( + "SparseTensorSpec shape must be either a 1D varlen tensor from VarLenTensorGenerator " + f"or fully defined sparse tensor from SparseTensorGenerator. Found {spec}" + ) + else: + raise TypeError(f"Spec {spec} is not supported in TFRecordWriter") + + def create_tf_example_parser_fn(self, data_source, with_batch=False): + """Create tf.Example parser function based on the data_source. + + The parser function can be used for parsing tf.Example. + Example usage: + + ``` python + data_source = DataSource(...) + with TFRecordWriter() as writer: + dir_path = writer.write(data_source) + parser_fn = writer.create_tf_example_parser_fn(data_source) + + pattern = os.path.join(dir_path, f"*.{writer.extension}") + dataset = tf.data.Dataset.list_files(pattern) + dataset = tf.data.TFRecordDataset(dataset) + dataset = dataset.map(parser_fn) + ``` + + Args: + data_source: A DataSource object describe the format of the data. + with_batch: True if the parser function should take a number of + serialized tf.Example proto. Default is false. + + Returns: + A callable function that takes serialized tf.Example proto as input, + and returns the parsed tensor dict. + """ + scenario = data_source.scenario + feature_description = { + name: self._build_tf_example_parsing_config(name, scenario[name]) + for name in scenario + } + + if with_batch: + + def _batch_examples_parser_fn(example_proto): + return tf.io.parse_example(example_proto, feature_description) + + return _batch_examples_parser_fn + + def _single_example_parser_fn(example_proto): + return tf.io.parse_single_example(example_proto, feature_description) + + return _single_example_parser_fn + + def _build_tf_example_parsing_config(self, name, generator): + """Build tf.Example parsing config + + Args: + name: A str feature name. + generator: Generator for this tensor. + + Returns: + tf.io.FixedLenFeature if generator is TensorGenerator. 
+ tf.io.SparseFeature if generator is SparseTensorGenerator. + tf.io.VarLenFeature if generator is VarlenTensorGenerator. + + Raises: + TypeError: if generator is not TensorGenerator, SparseTensorGenerator, or VarlenTensorGenerator. + """ + spec = generator.spec + example_dtype = self._map_tensor_dtype_to_example_dtype(spec.dtype) + if isinstance(spec, tf.TensorSpec): + return tf.io.FixedLenFeature(shape=spec.shape, dtype=example_dtype) + elif isinstance(spec, tf.SparseTensorSpec): + if ( + issubclass(generator.get_generator_cls(), VarLenTensorGeneratorBase) + and spec.shape.rank == 1 + ): + return tf.io.VarLenFeature(dtype=example_dtype) + elif ( + issubclass(generator.get_generator_cls(), SparseTensorGeneratorBase) + and spec.shape.is_fully_defined() + ): + index_name = name + TFRecordWriter._SPARSE_INDICES_SUFFIX + rank = len(spec.shape) + index_key = [f"{index_name}{dim}" for dim in range(rank)] + value_key = name + TFRecordWriter._SPARSE_VALUES_SUFFIX + return tf.io.SparseFeature( + index_key=index_key, + value_key=value_key, + dtype=example_dtype, + size=spec.shape, + ) + else: + raise ValueError( + "SparseTensorSpec shape must be either a 1D varlen tensor from VarLenTensorGenerator " + f"or fully defined sparse tensor from SparseTensorGenerator. Found {spec}" + ) + else: + raise TypeError(f"Spec {spec} is not supported in TFRecordWriter.") diff --git a/tools/docker/devel.Dockerfile b/tools/docker/devel.Dockerfile index ab6e8812c..6cb5d39c5 100644 --- a/tools/docker/devel.Dockerfile +++ b/tools/docker/devel.Dockerfile @@ -40,6 +40,8 @@ ARG PIP_ADD_PACKAGES="" RUN /bin/bash -c "source activate tfio-dev && python -m pip install \ avro-python3 \ + python-snappy \ + parameterized \ pytest \ pytest-benchmark \ pylint \ @@ -48,6 +50,7 @@ RUN /bin/bash -c "source activate tfio-dev && python -m pip install \ google-cloud-bigquery-storage==1.1.0 \ pyarrow==${ARROW_VERSION} \ pandas \ + scipy \ fastavro \ gast==0.2.2 \ ${PIP_ADD_PACKAGES} \