Add GCS support (#4892)
royi-luo authored Feb 18, 2025
1 parent 6ec03df commit 113ba6b
Showing 36 changed files with 466 additions and 195 deletions.
31 changes: 29 additions & 2 deletions .github/workflows/ci-workflow.yml
@@ -69,6 +69,8 @@ jobs:
GEN: Ninja
CC: gcc
CXX: g++
GCS_ACCESS_KEY_ID: ${{ secrets.GCS_ACCESS_KEY_ID }}
GCS_SECRET_ACCESS_KEY: ${{ secrets.GCS_SECRET_ACCESS_KEY }}
steps:
- uses: actions/checkout@v4

@@ -82,15 +84,22 @@ jobs:
run: |
bash scripts/generate_binary_tinysnb.sh
s3cmd get s3://kuzu-test/tinysnb/version.txt || echo "0" > version.txt
if [ "$(cat tinysnb/version.txt)" == "$(cat version.txt)" ]; then
s3cmd --access_key=${GCS_ACCESS_KEY_ID} --secret_key=${GCS_SECRET_ACCESS_KEY} --host=storage.googleapis.com --host-bucket="%(bucket)s.storage.googleapis.com" get s3://kuzudb-test/tinysnb/version.txt || echo "0" > version_gcs.txt
if [ "$(cat tinysnb/version.txt)" == "$(cat version.txt)" ] && [ "$(cat tinysnb/version.txt)" == "$(cat version_gcs.txt)" ]; then
echo "TinySNB dataset is up to date, skipping upload"
rm -rf tinysnb version.txt
exit 0
fi
echo "TinySNB dataset is outdated, uploading..."
s3cmd del -r s3://kuzu-test/tinysnb/
s3cmd sync ./tinysnb s3://kuzu-test/
rm -rf tinysnb version.txt
# s3cmd del -r doesn't work on GCS so we individually delete each object in the directory
for gcs_file in $(s3cmd --access_key=${GCS_ACCESS_KEY_ID} --secret_key=${GCS_SECRET_ACCESS_KEY} --host=storage.googleapis.com --host-bucket="%(bucket)s.storage.googleapis.com" ls s3://kuzudb-test/tinysnb/ | awk '{ print $4 }');
do
s3cmd --access_key=${GCS_ACCESS_KEY_ID} --secret_key=${GCS_SECRET_ACCESS_KEY} --host=storage.googleapis.com --host-bucket="%(bucket)s.storage.googleapis.com" del ${gcs_file}
done
s3cmd --access_key=${GCS_ACCESS_KEY_ID} --secret_key=${GCS_SECRET_ACCESS_KEY} --host=storage.googleapis.com --host-bucket="%(bucket)s.storage.googleapis.com" sync ./tinysnb s3://kuzudb-test/
rm -rf tinysnb version.txt version_gcs.txt
- name: Generate and upload ldbc-sf01
run: |
@@ -131,6 +140,8 @@ jobs:
AWS_S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
GCS_ACCESS_KEY_ID: ${{ secrets.GCS_ACCESS_KEY_ID }}
GCS_SECRET_ACCESS_KEY: ${{ secrets.GCS_SECRET_ACCESS_KEY }}
RUN_ID: "$(hostname)-$(date +%s)"
HTTP_CACHE_FILE: TRUE

@@ -205,6 +216,8 @@ jobs:
AWS_S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
GCS_ACCESS_KEY_ID: ${{ secrets.GCS_ACCESS_KEY_ID }}
GCS_SECRET_ACCESS_KEY: ${{ secrets.GCS_SECRET_ACCESS_KEY }}
RUN_ID: "$(hostname)-$(date +%s)"
HTTP_CACHE_FILE: TRUE

@@ -425,6 +438,8 @@ jobs:
AWS_S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
GCS_ACCESS_KEY_ID: ${{ secrets.GCS_ACCESS_KEY_ID }}
GCS_SECRET_ACCESS_KEY: ${{ secrets.GCS_SECRET_ACCESS_KEY }}
RUN_ID: "$(hostname)-$(date +%s)"
HTTP_CACHE_FILE: TRUE
steps:
@@ -499,6 +514,8 @@ jobs:
AWS_S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
GCS_ACCESS_KEY_ID: ${{ secrets.GCS_ACCESS_KEY_ID }}
GCS_SECRET_ACCESS_KEY: ${{ secrets.GCS_SECRET_ACCESS_KEY }}
RUN_ID: "$(hostname)-$(date +%s)"
HTTP_CACHE_FILE: TRUE
steps:
@@ -543,6 +560,8 @@ jobs:
AWS_S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
GCS_ACCESS_KEY_ID: ${{ secrets.GCS_ACCESS_KEY_ID }}
GCS_SECRET_ACCESS_KEY: ${{ secrets.GCS_SECRET_ACCESS_KEY }}
PG_HOST: ${{ secrets.PG_HOST }}
RUN_ID: "$(hostname)-$([Math]::Floor((Get-Date).TimeOfDay.TotalSeconds))"
HTTP_CACHE_FILE: TRUE
@@ -799,6 +818,8 @@ jobs:
AWS_S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
GCS_ACCESS_KEY_ID: ${{ secrets.GCS_ACCESS_KEY_ID }}
GCS_SECRET_ACCESS_KEY: ${{ secrets.GCS_SECRET_ACCESS_KEY }}
PG_HOST: ${{ secrets.PG_HOST }}
RUN_ID: "$(hostname)-$(date +%s)"
HTTP_CACHE_FILE: TRUE
@@ -923,6 +944,8 @@ jobs:
AWS_S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
GCS_ACCESS_KEY_ID: ${{ secrets.GCS_ACCESS_KEY_ID }}
GCS_SECRET_ACCESS_KEY: ${{ secrets.GCS_SECRET_ACCESS_KEY }}
RUN_ID: "$(hostname)-$(date +%s)"
HTTP_CACHE_FILE: TRUE
ASAN_OPTIONS: detect_leaks=1
@@ -977,6 +1000,8 @@ jobs:
AWS_S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
GCS_ACCESS_KEY_ID: ${{ secrets.GCS_ACCESS_KEY_ID }}
GCS_SECRET_ACCESS_KEY: ${{ secrets.GCS_SECRET_ACCESS_KEY }}
PG_HOST: ${{ secrets.PG_HOST }}
RUN_ID: "$(hostname)-$(date +%s)"
HTTP_CACHE_FILE: TRUE
@@ -1037,6 +1062,8 @@ jobs:
AWS_S3_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_SECRET_ACCESS_KEY }}
GCS_ACCESS_KEY_ID: ${{ secrets.GCS_ACCESS_KEY_ID }}
GCS_SECRET_ACCESS_KEY: ${{ secrets.GCS_SECRET_ACCESS_KEY }}
PG_HOST: ${{ secrets.PG_HOST }}
RUN_ID: "$(hostname)-$([Math]::Floor((Get-Date).TimeOfDay.TotalSeconds))"
HTTP_CACHE_FILE: TRUE
4 changes: 1 addition & 3 deletions extension/delta/src/connector/delta_connector.cpp
@@ -1,7 +1,5 @@
#include "connector/delta_connector.h"

#include "connector/duckdb_secret_manager.h"

namespace kuzu {
namespace delta_extension {

Expand All @@ -15,7 +13,7 @@ void DeltaConnector::connect(const std::string& /*dbPath*/, const std::string& /
executeQuery("load delta;");
executeQuery("install httpfs;");
executeQuery("load httpfs;");
executeQuery(duckdb_extension::DuckDBSecretManager::getS3Secret(context));
initRemoteFSSecrets(context);
}

} // namespace delta_extension
2 changes: 1 addition & 1 deletion extension/delta/src/main/CMakeLists.txt
@@ -1,7 +1,7 @@
add_library(kuzu_delta_extension
OBJECT
delta_extension.cpp
${PROJECT_SOURCE_DIR}/extension/httpfs/src/s3_download_options.cpp)
${PROJECT_SOURCE_DIR}/extension/httpfs/src/s3fs_config.cpp)

set(DELTA_EXTENSION_OBJECT_FILES
${DELTA_EXTENSION_OBJECT_FILES} $<TARGET_OBJECTS:kuzu_delta_extension>
6 changes: 3 additions & 3 deletions extension/delta/src/main/delta_extension.cpp
@@ -3,16 +3,16 @@
#include "function/delta_scan.h"
#include "main/client_context.h"
#include "main/database.h"
#include "s3_download_options.h"
#include "main/duckdb_extension.h"
#include "s3fs_config.h"

namespace kuzu {
namespace delta_extension {

void DeltaExtension::load(main::ClientContext* context) {
auto& db = *context->getDatabase();
extension::ExtensionUtils::addTableFunc<DeltaScanFunction>(db);
httpfs::S3DownloadOptions::registerExtensionOptions(&db);
httpfs::S3DownloadOptions::setEnvValue(context);
duckdb_extension::DuckDBExtension::loadRemoteFSOptions(context);
}

} // namespace delta_extension
2 changes: 1 addition & 1 deletion extension/duckdb/CMakeLists.txt
@@ -23,7 +23,7 @@ add_subdirectory(src/loader)
add_library(duckdb_extension
SHARED
${DUCKDB_EXTENSION_OBJECT_FILES}
${PROJECT_SOURCE_DIR}/extension/httpfs/src/s3_download_options.cpp)
${PROJECT_SOURCE_DIR}/extension/httpfs/src/s3fs_config.cpp)

target_link_libraries(duckdb_extension
PRIVATE
3 changes: 0 additions & 3 deletions extension/duckdb/src/connector/duckdb_connector.cpp
@@ -1,8 +1,5 @@
#include "connector/duckdb_connector.h"

#include "common/exception/runtime.h"
#include "main/client_context.h"

namespace kuzu {
namespace duckdb_extension {

35 changes: 16 additions & 19 deletions extension/duckdb/src/connector/duckdb_secret_manager.cpp
@@ -1,32 +1,29 @@
#include "connector/duckdb_secret_manager.h"

#include "s3_download_options.h"
#include "s3fs_config.h"

namespace kuzu {
namespace duckdb_extension {

static std::string getDuckDBExtensionOptions(main::ClientContext* context, std::string optionName) {
static common::case_insensitive_map_t<std::string> DUCKDB_OPTION_NAMES = {
{httpfs::S3AccessKeyID::NAME, "KEY_ID"}, {httpfs::S3SecretAccessKey::NAME, "SECRET"},
{httpfs::S3EndPoint::NAME, "ENDPOINT"}, {httpfs::S3URLStyle::NAME, "URL_STYLE"},
{httpfs::S3Region::NAME, "REGION"}};
auto optionNameInDuckDB = DUCKDB_OPTION_NAMES.at(optionName);
auto optionValueInKuzu = context->getCurrentSetting(optionName).toString();
return common::stringFormat("{} '{}',", optionNameInDuckDB, optionValueInKuzu);
static std::string getDuckDBExtensionOptions(httpfs::S3AuthParams kuzuOptions) {
std::string options = "";
options.append(common::stringFormat("KEY_ID '{}',", kuzuOptions.accessKeyID));
options.append(common::stringFormat("SECRET '{}',", kuzuOptions.secretAccessKey));
options.append(common::stringFormat("ENDPOINT '{}',", kuzuOptions.endpoint));
options.append(common::stringFormat("URL_STYLE '{}',", kuzuOptions.urlStyle));
options.append(common::stringFormat("REGION '{}',", kuzuOptions.region));
return options;
}

std::string DuckDBSecretManager::getS3Secret(main::ClientContext* context) {
std::string templateQuery = R"(CREATE SECRET s3_secret (
std::string DuckDBSecretManager::getRemoteFSSecret(main::ClientContext* context,
const httpfs::S3FileSystemConfig& config) {
KU_ASSERT(config.fsName == "S3" || config.fsName == "GCS");
std::string templateQuery = R"(CREATE SECRET {}_secret (
{}
TYPE S3
TYPE {}
);)";
std::string options = "";
options += getDuckDBExtensionOptions(context, httpfs::S3AccessKeyID::NAME);
options += getDuckDBExtensionOptions(context, httpfs::S3SecretAccessKey::NAME);
options += getDuckDBExtensionOptions(context, httpfs::S3Region::NAME);
options += getDuckDBExtensionOptions(context, httpfs::S3URLStyle::NAME);
options += getDuckDBExtensionOptions(context, httpfs::S3EndPoint::NAME);
return common::stringFormat(templateQuery, options);
return common::stringFormat(templateQuery, config.fsName,
getDuckDBExtensionOptions(config.getAuthParams(context)), config.fsName);
}

} // namespace duckdb_extension
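Note: the refactored secret manager now builds one DuckDB CREATE SECRET statement per available remote file system instead of a hard-coded S3 secret. The sketch below (not part of the commit) shows the calling side, with the approximate expansion for the GCS config in a comment; the placeholder values are assumptions, not defaults taken from the extension.

// Sketch only: how a connector consumes DuckDBSecretManager::getRemoteFSSecret.
// The expanded SQL in the comment uses placeholder credential values.
#include <functional>
#include <string>

#include "connector/duckdb_secret_manager.h"
#include "s3fs_config.h"

namespace kuzu {
namespace duckdb_extension {

void createRemoteFSSecrets(main::ClientContext* context,
    const std::function<void(std::string)>& executeQuery) {
    for (auto& fsConfig : httpfs::S3FileSystemConfig::getAvailableConfigs()) {
        // For the GCS config, getRemoteFSSecret expands the template to roughly:
        //   CREATE SECRET GCS_secret (
        //   KEY_ID '<hmac key>',SECRET '<hmac secret>',ENDPOINT '<endpoint>',
        //   URL_STYLE '<style>',REGION '<region>',
        //   TYPE GCS
        //   );
        executeQuery(DuckDBSecretManager::getRemoteFSSecret(context, fsConfig));
    }
}

} // namespace duckdb_extension
} // namespace kuzu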
4 changes: 2 additions & 2 deletions extension/duckdb/src/connector/remote_duckdb_connector.cpp
@@ -1,8 +1,8 @@
#include "connector/remote_duckdb_connector.h"

#include "common/case_insensitive_map.h"
#include "connector/duckdb_secret_manager.h"
#include "main/client_context.h"
#include "s3fs_config.h"

namespace kuzu {
namespace duckdb_extension {
@@ -24,7 +24,7 @@ void S3DuckDBConnector::connect(const std::string& dbPath, const std::string& ca
connection = std::make_unique<duckdb::Connection>(*instance);
executeQuery("install httpfs;");
executeQuery("load httpfs;");
executeQuery(DuckDBSecretManager::getS3Secret(context));
initRemoteFSSecrets(context);
executeQuery(common::stringFormat("attach '{}' as {} (read_only);", dbPath, catalogName));
}

9 changes: 8 additions & 1 deletion extension/duckdb/src/include/connector/duckdb_connector.h
@@ -8,8 +8,9 @@
#pragma GCC diagnostic pop

#include "binder/binder.h"
#include "common/assert.h"
#include "connector/duckdb_secret_manager.h"
#include "function/duckdb_scan.h"
#include "s3fs_config.h"

namespace kuzu {
namespace main {
@@ -40,6 +41,12 @@ class DuckDBConnector {

std::unique_ptr<duckdb::MaterializedQueryResult> executeQuery(std::string query) const;

void initRemoteFSSecrets(main::ClientContext* context) const {
for (auto& fsConfig : httpfs::S3FileSystemConfig::getAvailableConfigs()) {
executeQuery(DuckDBSecretManager::getRemoteFSSecret(context, fsConfig));
}
}

protected:
std::unique_ptr<duckdb::DuckDB> instance;
std::unique_ptr<duckdb::Connection> connection;
extension/duckdb/src/include/connector/duckdb_secret_manager.h
@@ -3,10 +3,15 @@
#include "main/client_context.h"

namespace kuzu {
namespace httpfs {
struct S3FileSystemConfig;
}

namespace duckdb_extension {

struct DuckDBSecretManager {
static std::string getS3Secret(main::ClientContext* context);
static std::string getRemoteFSSecret(main::ClientContext* context,
const httpfs::S3FileSystemConfig& config);
};

} // namespace duckdb_extension
11 changes: 11 additions & 0 deletions extension/duckdb/src/include/main/duckdb_extension.h
@@ -1,17 +1,28 @@
#pragma once

#include "extension/extension.h"
#include "main/client_context.h"
#include "s3fs_config.h"

namespace kuzu {
namespace duckdb_extension {

class DuckDBConnector;

class DuckDBExtension final : public extension::Extension {
public:
static constexpr char EXTENSION_NAME[] = "DUCKDB";
static constexpr const char* DEPENDENCY_LIB_FILES[] = {"libduckdb"};

public:
static void load(main::ClientContext* context);
static void loadRemoteFSOptions(main::ClientContext* context) {
auto& db = *context->getDatabase();
for (auto& fsConfig : httpfs::S3FileSystemConfig::getAvailableConfigs()) {
fsConfig.registerExtensionOptions(&db);
fsConfig.setEnvValue(context);
}
}
};

} // namespace duckdb_extension
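The new s3fs_config.h header itself is not included in this excerpt. Reconstructed purely from its call sites above (getAvailableConfigs, fsName, getAuthParams, registerExtensionOptions, setEnvValue), the interface looks roughly like the following sketch; the actual declaration in the repository may differ.

// Reconstructed sketch of httpfs::S3FileSystemConfig, inferred from its call
// sites in this commit; anything not visible in the diff (return types, exact
// member layout) is an assumption.
#pragma once

#include <string>
#include <vector>

namespace kuzu {
namespace main {
class ClientContext;
class Database;
} // namespace main

namespace httpfs {

struct S3AuthParams {
    std::string accessKeyID;
    std::string secretAccessKey;
    std::string endpoint;
    std::string urlStyle;
    std::string region;
};

struct S3FileSystemConfig {
    // "S3" or "GCS"; also used as the DuckDB secret name and secret TYPE.
    std::string fsName;

    // One entry per supported remote file system.
    static std::vector<S3FileSystemConfig> getAvailableConfigs();

    // Registers the per-filesystem settings (access key, secret, endpoint, ...)
    // as extension options on the database.
    void registerExtensionOptions(main::Database* db);

    // Seeds those options from environment variables such as
    // GCS_ACCESS_KEY_ID / GCS_SECRET_ACCESS_KEY.
    void setEnvValue(main::ClientContext* context);

    // Resolves the current option values for this file system.
    S3AuthParams getAuthParams(main::ClientContext* context) const;
};

} // namespace httpfs
} // namespace kuzu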
7 changes: 4 additions & 3 deletions extension/duckdb/src/main/duckdb_extension.cpp
@@ -1,7 +1,9 @@
#include "main/duckdb_extension.h"

#include "connector/duckdb_connector.h"
#include "connector/duckdb_secret_manager.h"
#include "main/client_context.h"
#include "s3_download_options.h"
#include "s3fs_config.h"
#include "storage/duckdb_storage.h"

namespace kuzu {
@@ -10,8 +12,7 @@ namespace duckdb_extension {
void DuckDBExtension::load(main::ClientContext* context) {
auto db = context->getDatabase();
db->registerStorageExtension(EXTENSION_NAME, std::make_unique<DuckDBStorageExtension>(db));
httpfs::S3DownloadOptions::registerExtensionOptions(db);
httpfs::S3DownloadOptions::setEnvValue(context);
DuckDBExtension::loadRemoteFSOptions(context);
}

} // namespace duckdb_extension
2 changes: 1 addition & 1 deletion extension/httpfs/CMakeLists.txt
@@ -26,7 +26,7 @@ add_library(httpfs
src/crypto.cpp
src/http_config.cpp
src/cached_file_manager.cpp
src/s3_download_options.cpp)
src/s3fs_config.cpp)

target_link_libraries(httpfs
PRIVATE
5 changes: 3 additions & 2 deletions extension/httpfs/src/httpfs.cpp
@@ -29,7 +29,8 @@ void HTTPFileInfo::initMetadata() {
std::string rangeLength;
if (res->code != 200) {
if ((flags & FileFlags::WRITE) && res->code == 404) {
if (!(flags & FileFlags::CREATE_IF_NOT_EXISTS)) {
if (!(flags & FileFlags::CREATE_IF_NOT_EXISTS) &&
!(flags & FileFlags::CREATE_AND_TRUNCATE_IF_EXISTS)) {
throw IOException(stringFormat("Unable to open URL: \"{}\" for writing: file does "
"not exist and CREATE flag is not set",
path));
@@ -137,7 +138,7 @@ std::vector<std::string> HTTPFileSystem::glob(main::ClientContext* /*context*/,
return {path};
}

bool HTTPFileSystem::canHandleFile(const std::string& path) const {
bool HTTPFileSystem::canHandleFile(const std::string_view path) const {
return path.rfind("https://", 0) == 0 || path.rfind("http://", 0) == 0;
}

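Taken together, these changes let Kuzu's httpfs and DuckDB-based extensions read data from Google Cloud Storage using HMAC credentials. A minimal usage sketch with the C++ API follows; the gcs_* option names and the gs:// path are assumptions inferred from the existing s3_* conventions and the GCS_ACCESS_KEY_ID / GCS_SECRET_ACCESS_KEY variables in the workflow above, not confirmed by this diff.

// Minimal sketch (assumptions noted): scanning a CSV file stored in GCS from
// the Kuzu C++ API after this commit. The bucket and object path are hypothetical.
#include <iostream>
#include <memory>

#include "kuzu.hpp"

using namespace kuzu::main;

int main() {
    auto database = std::make_unique<Database>(":memory:");
    auto connection = std::make_unique<Connection>(database.get());

    connection->query("INSTALL httpfs;");
    connection->query("LOAD EXTENSION httpfs;");

    // Assumed option names, mirroring the existing s3_* settings and the
    // GCS_ACCESS_KEY_ID / GCS_SECRET_ACCESS_KEY variables used in CI.
    connection->query("CALL gcs_access_key_id='<hmac key id>';");
    connection->query("CALL gcs_secret_access_key='<hmac secret>';");

    // Hypothetical bucket and object.
    auto result = connection->query("LOAD FROM 'gs://my-bucket/people.csv' RETURN *;");
    std::cout << result->toString() << std::endl;
    return 0;
}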