From d7418d4d311666075605b67f8335c453fa6356f0 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Thu, 12 Dec 2024 16:57:05 -0500 Subject: [PATCH] fix bug when reading w wildcard --- docs/examples/UserGuide/getting_started.jl | 2 +- src/TidierDB.jl | 31 +++++++++++++++------- src/docstrings.jl | 2 +- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/docs/examples/UserGuide/getting_started.jl b/docs/examples/UserGuide/getting_started.jl index 3466458..e1e8dbd 100644 --- a/docs/examples/UserGuide/getting_started.jl +++ b/docs/examples/UserGuide/getting_started.jl @@ -49,7 +49,7 @@ # - `db_table` has two required arguments: `connection` and `table` # - `table` can be a table name on a database or a path/url to file to read. When passing `db_table` a path or url, the table is not copied into memory. # - Of note, `db_table` only support direct file paths to a table. It does not support database file paths such as `dbname.duckdb` or `dbname.sqlite`. Such files must be used with `connect` first. -# - With DuckDB and ClickHouse, if you have a folder of multiple files to read, you can use `*` read in all files matching the pattern. +# - With DuckDB and ClickHouse, if you have a folder of multiple files to read, you can use `*` to read in all files matching the pattern, with an optional `alias` argument that sets the name the data is referred to by. # - For example, the below would read all files that end in `.csv` in the given folder. 
# ``` # db_table(db, "folder/path/*.csv") diff --git a/src/TidierDB.jl b/src/TidierDB.jl index c0be2a0..89ff989 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -188,7 +188,7 @@ function finalize_query(sqlquery::SQLQuery) end # DuckDB -function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_name::String) +function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_name::String; alias::String="") set_sql_mode(duckdb()); if endswith(table_name, ".geoparquet'") query = @@ -203,12 +203,18 @@ function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_nam end result = DuckDB.execute(conn, query) |> DataFrame result[!, :current_selxn] .= 1 - table_name = if occursin(r"[:/\\]", table_name) - split(basename(table_name), '.')[1] - elseif occursin(".", table_name) - split(basename(table_name), '.')[end] + if occursin("*" , table_name) + if alias != "" + table_name = alias + else + table_name = "data" + end + elseif occursin(r"[:/\\]", table_name) + table_name = split(basename(table_name), '.')[1] + elseif occursin(".", table_name) + table_name = split(basename(table_name), '.')[end] else - table_name + table_name = table_name end if occursin("-" , table_name) table_name = replace(table_name, "-" => "_") @@ -227,7 +233,7 @@ end """ $docstring_db_table """ -function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, delta::Bool=false) +function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, delta::Bool=false, alias::String="") table_name = string(table) if current_sql_mode[] == sqlite() @@ -249,7 +255,7 @@ function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, de metadata = get_table_metadata(db, table_name2) elseif occursin(r"[:/\\]", table_name) table_name2 = "'$table_name'" - metadata = get_table_metadata(db, table_name2) + metadata = get_table_metadata(db, table_name2; alias = alias) else metadata = get_table_metadata(db, table_name) end @@ -273,10 
+279,15 @@ function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, de elseif delta "delta_scan('$table_name')" elseif occursin(r"[:/\\]", table_name) && !(iceberg || delta) && !startswith(table_name, "read") + if occursin(r"\*", table_name) + alias = alias == "" ? "data" : alias + else + alias = (split(basename(table_name), '.')[1]) + end name = if occursin(".geoparquet", table_name) - "read_parquet('$table_name') AS $(split(basename(table_name), '.')[1]) " + "read_parquet('$table_name') AS $alias " else - "'$table_name' AS $(split(basename(table_name), '.')[1]) " + "'$table_name' AS $alias " end formatted_table_name = begin parts = split(name, " AS ") diff --git a/src/docstrings.jl b/src/docstrings.jl index 5e2aab5..f95790f 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -1167,7 +1167,7 @@ name it will not copy it to memory, but rather ready directly from the file. `db - `db_table(db, "Path/to/testing_files/*.parquet")` - `delta`: must be true to read delta files - `iceberg`: must be true to read iceberg finalize_ctes - +- `alias`: optional argument, used when the file path contains a `*` wildcard, that lets the user choose an alias for the data being read in. If empty, the table is referred to as `data`. # Example ```julia