-
Notifications
You must be signed in to change notification settings - Fork 42
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Delta lake data source (initial implementation) (#1119)
* delta stubs * Accessor (s3) * Downgrade datafusion + use storage opts when opening table * fmt
- Loading branch information
Showing
20 changed files
with
1,088 additions
and
112 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
use crate::delta::catalog::{DataCatalog, UnityCatalog}; | ||
use crate::delta::errors::Result; | ||
use deltalake::DeltaTable; | ||
use metastoreproto::types::options::{DeltaLakeCatalog, DeltaLakeUnityCatalog}; | ||
use std::collections::HashMap; | ||
use std::sync::Arc; | ||
use tracing::debug; | ||
|
||
/// Access a delta lake. | ||
pub struct DeltaLakeAccessor { | ||
catalog: Arc<dyn DataCatalog>, | ||
region: String, | ||
access_key_id: String, | ||
secret_access_key: String, | ||
} | ||
|
||
impl DeltaLakeAccessor { | ||
/// Connect to a deltalake using the provided catalog information. | ||
// TODO: Allow accessing delta tables without a catalog? | ||
// TODO: Don't be S3 specific. | ||
pub async fn connect( | ||
catalog: &DeltaLakeCatalog, | ||
access_key_id: &str, | ||
secret_access_key: &str, | ||
region: &str, | ||
) -> Result<DeltaLakeAccessor> { | ||
let catalog: Arc<dyn DataCatalog> = match catalog { | ||
DeltaLakeCatalog::Unity(DeltaLakeUnityCatalog { | ||
catalog_id, | ||
databricks_access_token, | ||
workspace_url, | ||
}) => { | ||
let catalog = | ||
UnityCatalog::connect(databricks_access_token, workspace_url, catalog_id) | ||
.await?; | ||
Arc::new(catalog) | ||
} | ||
}; | ||
|
||
Ok(DeltaLakeAccessor { | ||
catalog, | ||
region: region.to_string(), | ||
access_key_id: access_key_id.to_string(), | ||
secret_access_key: secret_access_key.to_string(), | ||
}) | ||
} | ||
|
||
pub async fn load_table(self, database: &str, table: &str) -> Result<DeltaTable> { | ||
let loc = self | ||
.catalog | ||
.get_table_storage_location(database, table) | ||
.await?; | ||
|
||
debug!(%loc, %database, %table, "deltalake location"); | ||
|
||
let mut opts = HashMap::new(); | ||
opts.insert("aws_access_key_id".to_string(), self.access_key_id); | ||
opts.insert("aws_secret_access_key".to_string(), self.secret_access_key); | ||
opts.insert("aws_region".to_string(), self.region); | ||
|
||
let table = deltalake::open_table_with_storage_options(loc, opts).await?; | ||
|
||
// Note that the deltalake crate does the appropriate jank for | ||
// registering the object store in the datafusion session's runtime env | ||
// during execution. | ||
Ok(table) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
//! Delta lake catalog implementations. | ||
//! | ||
//! Most of this was copied in from the `deltalake` crate to make some | ||
//! modifications with how we construct clients, and what errors get returned. | ||
use crate::delta::errors::{DeltaError, Result}; | ||
use async_trait::async_trait; | ||
use reqwest::header; | ||
use serde::Deserialize; | ||
|
||
#[async_trait] | ||
pub trait DataCatalog: Sync + Send { | ||
/// Get the storage location for a given table. | ||
async fn get_table_storage_location( | ||
&self, | ||
database_name: &str, // "schema" | ||
table_name: &str, | ||
) -> Result<String>; | ||
} | ||
|
||
/// Databricks Unity Catalog - implementation of the `DataCatalog` trait | ||
#[derive(Debug, Clone)] | ||
pub struct UnityCatalog { | ||
client: reqwest::Client, | ||
workspace_url: String, | ||
catalog_id: String, | ||
} | ||
|
||
impl UnityCatalog { | ||
pub async fn connect( | ||
access_token: &str, | ||
workspace_url: &str, | ||
catalog_id: &str, | ||
) -> Result<Self> { | ||
let auth_header_val = header::HeaderValue::from_str(&format!("Bearer {}", &access_token)) | ||
.map_err(|_| DeltaError::Static("Invalid Databricks access token"))?; | ||
|
||
let headers = header::HeaderMap::from_iter([(header::AUTHORIZATION, auth_header_val)]); | ||
let client = reqwest::Client::builder() | ||
.default_headers(headers) | ||
.build()?; | ||
|
||
// Check that we can reach the databricks workspace. | ||
let _resp = client | ||
.get(format!("{}/api/2.1/unity-catalog/catalogs", workspace_url)) | ||
.send() | ||
.await?; | ||
|
||
Ok(Self { | ||
client, | ||
workspace_url: workspace_url.to_string(), | ||
catalog_id: catalog_id.to_string(), | ||
}) | ||
} | ||
} | ||
|
||
#[derive(Deserialize)] | ||
#[serde(untagged)] | ||
enum TableResponse { | ||
Success { storage_location: String }, | ||
Error { error_code: String, message: String }, | ||
} | ||
|
||
#[async_trait] | ||
impl DataCatalog for UnityCatalog { | ||
/// Get the table storage location from the UnityCatalog | ||
async fn get_table_storage_location( | ||
&self, | ||
database_name: &str, | ||
table_name: &str, | ||
) -> Result<String> { | ||
let resp = self | ||
.client | ||
.get(format!( | ||
"{}/api/2.1/unity-catalog/tables/{}.{}.{}", | ||
&self.workspace_url, self.catalog_id, database_name, table_name | ||
)) | ||
.send() | ||
.await?; | ||
|
||
let parsed_resp: TableResponse = resp.json().await?; | ||
match parsed_resp { | ||
TableResponse::Success { storage_location } => Ok(storage_location), | ||
TableResponse::Error { | ||
error_code, | ||
message, | ||
} => Err(DeltaError::UnityInvalidTable { | ||
error_code, | ||
message, | ||
}), | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#[derive(Debug, thiserror::Error)] | ||
pub enum DeltaError { | ||
#[error(transparent)] | ||
DeltaTable(#[from] deltalake::DeltaTableError), | ||
|
||
#[error("Invalid table error from unity catalog: {error_code}: {message}")] | ||
UnityInvalidTable { error_code: String, message: String }, | ||
|
||
#[error(transparent)] | ||
Reqwest(#[from] reqwest::Error), | ||
|
||
#[error(transparent)] | ||
ObjectStore(#[from] object_store::Error), | ||
|
||
#[error(transparent)] | ||
DataFusion(#[from] datafusion::error::DataFusionError), | ||
|
||
#[error(transparent)] | ||
UrlParse(#[from] url::ParseError), | ||
|
||
#[error("{0}")] | ||
Static(&'static str), | ||
} | ||
|
||
pub type Result<T, E = DeltaError> = std::result::Result<T, E>; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
pub mod access; | ||
pub mod catalog; | ||
pub mod errors; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
pub mod bigquery; | ||
pub mod debug; | ||
pub mod delta; | ||
pub mod mongodb; | ||
pub mod mysql; | ||
pub mod object_store; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.