-
Notifications
You must be signed in to change notification settings - Fork 288
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
201 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
--- a/c++/src/OrcHdfsFile.cc 2022-04-11 04:30:41.000000000 +0800 | ||
+++ b/c++/src/OrcHdfsFile.cc 2022-04-11 19:56:37.206680217 +0800 | ||
@@ -1,4 +1,5 @@ | ||
/** | ||
+ * 1 | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
@@ -29,145 +30,57 @@ | ||
#include <sys/types.h> | ||
#include <unistd.h> | ||
|
||
-#include "hdfspp/hdfspp.h" | ||
+#include "tensorflow/core/platform/env.h" | ||
+#include "tensorflow/core/platform/file_system.h" | ||
+#include "tensorflow/core/platform/logging.h" | ||
+#include "tensorflow/core/platform/status.h" | ||
+#include "tensorflow/core/platform/types.h" | ||
|
||
namespace orc { | ||
|
||
- class HdfsFileInputStream : public InputStream { | ||
- private: | ||
- std::string filename; | ||
- std::unique_ptr<hdfs::FileHandle> file; | ||
- std::unique_ptr<hdfs::FileSystem> file_system; | ||
- uint64_t totalLength; | ||
- const uint64_t READ_SIZE = 1024 * 1024; //1 MB | ||
- | ||
- public: | ||
- HdfsFileInputStream(std::string _filename) { | ||
- filename = _filename ; | ||
- | ||
- //Building a URI object from the given uri_path | ||
- hdfs::URI uri; | ||
- try { | ||
- uri = hdfs::URI::parse_from_string(filename); | ||
- } catch (const hdfs::uri_parse_error&) { | ||
- throw ParseError("Malformed URI: " + filename); | ||
- } | ||
- | ||
- //This sets conf path to default "$HADOOP_CONF_DIR" or "/etc/hadoop/conf" | ||
- //and loads configs core-site.xml and hdfs-site.xml from the conf path | ||
- hdfs::ConfigParser parser; | ||
- if(!parser.LoadDefaultResources()){ | ||
- throw ParseError("Could not load default resources. "); | ||
- } | ||
- auto stats = parser.ValidateResources(); | ||
- //validating core-site.xml | ||
- if(!stats[0].second.ok()){ | ||
- throw ParseError(stats[0].first + " is invalid: " + stats[0].second.ToString()); | ||
- } | ||
- //validating hdfs-site.xml | ||
- if(!stats[1].second.ok()){ | ||
- throw ParseError(stats[1].first + " is invalid: " + stats[1].second.ToString()); | ||
- } | ||
- hdfs::Options options; | ||
- if(!parser.get_options(options)){ | ||
- throw ParseError("Could not load Options object. "); | ||
- } | ||
- hdfs::IoService * io_service = hdfs::IoService::New(); | ||
- //Wrapping file_system into a unique pointer to guarantee deletion | ||
- file_system = std::unique_ptr<hdfs::FileSystem>( | ||
- hdfs::FileSystem::New(io_service, "", options)); | ||
- if (file_system.get() == nullptr) { | ||
- throw ParseError("Can't create FileSystem object. "); | ||
- } | ||
- hdfs::Status status; | ||
- //Checking if the user supplied the host | ||
- if(!uri.get_host().empty()){ | ||
- //Using port if supplied, otherwise using "" to look up port in configs | ||
- std::string port = uri.has_port() ? | ||
- std::to_string(uri.get_port()) : ""; | ||
- status = file_system->Connect(uri.get_host(), port); | ||
- if (!status.ok()) { | ||
- throw ParseError("Can't connect to " + uri.get_host() | ||
- + ":" + port + ". " + status.ToString()); | ||
- } | ||
- } else { | ||
- status = file_system->ConnectToDefaultFs(); | ||
- if (!status.ok()) { | ||
- if(!options.defaultFS.get_host().empty()){ | ||
- throw ParseError("Error connecting to " + | ||
- options.defaultFS.str() + ". " + status.ToString()); | ||
- } else { | ||
- throw ParseError( | ||
- "Error connecting to the cluster: defaultFS is empty. " | ||
- + status.ToString()); | ||
- } | ||
- } | ||
- } | ||
- | ||
- if (file_system.get() == nullptr) { | ||
- throw ParseError("Can't connect the file system. "); | ||
- } | ||
- | ||
- hdfs::FileHandle *file_raw = nullptr; | ||
- status = file_system->Open(uri.get_path(), &file_raw); | ||
- if (!status.ok()) { | ||
- throw ParseError("Can't open " | ||
- + uri.get_path() + ". " + status.ToString()); | ||
- } | ||
- //Wrapping file_raw into a unique pointer to guarantee deletion | ||
- file.reset(file_raw); | ||
- | ||
- hdfs::StatInfo stat_info; | ||
- status = file_system->GetFileInfo(uri.get_path(), stat_info); | ||
- if (!status.ok()) { | ||
- throw ParseError("Can't stat " | ||
- + uri.get_path() + ". " + status.ToString()); | ||
- } | ||
- totalLength = stat_info.length; | ||
+class HdfsFileInputStream : public InputStream { | ||
+ private: | ||
+ std::string filename_; | ||
+ std::unique_ptr<tensorflow::RandomAccessFile> file_; | ||
+ uint64_t total_length_; | ||
+ const uint64_t READ_SIZE = 1024 * 1024; // 1 MB | ||
+ | ||
+ public: | ||
+ HdfsFileInputStream(std::string filename) { | ||
+ filename_ = filename; | ||
+ tensorflow::Status status = | ||
+ tensorflow::Env::Default()->NewRandomAccessFile(filename_, &file_); | ||
+ if (!status.ok()) { | ||
+ LOG(FATAL) << status.ToString(); | ||
} | ||
|
||
- uint64_t getLength() const override { | ||
- return totalLength; | ||
- } | ||
+ tensorflow::Env::Default()->GetFileSize(filename_, &total_length_); | ||
+ } | ||
|
||
- uint64_t getNaturalReadSize() const override { | ||
- return READ_SIZE; | ||
- } | ||
+ uint64_t getLength() const override { return total_length_; } | ||
|
||
- void read(void* buf, | ||
- uint64_t length, | ||
- uint64_t offset) override { | ||
- | ||
- if (!buf) { | ||
- throw ParseError("Buffer is null"); | ||
- } | ||
- | ||
- hdfs::Status status; | ||
- size_t total_bytes_read = 0; | ||
- size_t last_bytes_read = 0; | ||
- | ||
- do { | ||
- status = file->PositionRead(buf, | ||
- static_cast<size_t>(length) - total_bytes_read, | ||
- static_cast<off_t>(offset + total_bytes_read), &last_bytes_read); | ||
- if(!status.ok()) { | ||
- throw ParseError("Error reading the file: " + status.ToString()); | ||
- } | ||
- total_bytes_read += last_bytes_read; | ||
- } while (total_bytes_read < length); | ||
- } | ||
+ uint64_t getNaturalReadSize() const override { return READ_SIZE; } | ||
|
||
- const std::string& getName() const override { | ||
- return filename; | ||
+ void read(void* buf, uint64_t length, uint64_t offset) override { | ||
+ if (!buf) { | ||
+ LOG(FATAL) << " Null buf"; | ||
+ } | ||
+ tensorflow::StringPiece sp; | ||
+ tensorflow::Status s = | ||
+ file_->Read(offset, length, &sp, static_cast<char*>(buf)); | ||
+ if (!(s.ok() || tensorflow::errors::IsOutOfRange(s))) { | ||
+ LOG(FATAL) << s.ToString(); | ||
} | ||
+ } | ||
|
||
- ~HdfsFileInputStream() override; | ||
- }; | ||
+ const std::string& getName() const override { return filename_; } | ||
|
||
- HdfsFileInputStream::~HdfsFileInputStream() { | ||
- } | ||
+ ~HdfsFileInputStream() override; | ||
+}; | ||
|
||
- std::unique_ptr<InputStream> readHdfsFile(const std::string& path) { | ||
- return std::unique_ptr<InputStream>(new HdfsFileInputStream(path)); | ||
- } | ||
+HdfsFileInputStream::~HdfsFileInputStream() {} | ||
+ | ||
+std::unique_ptr<InputStream> readHdfsFile(const std::string& path) { | ||
+ return std::unique_ptr<InputStream>(new HdfsFileInputStream(path)); | ||
} | ||
+} // namespace orc |