From d7418d4d311666075605b67f8335c453fa6356f0 Mon Sep 17 00:00:00 2001
From: Daniel Rizk <rizkytennis@gmail.com>
Date: Thu, 12 Dec 2024 16:57:05 -0500
Subject: [PATCH] fix bug when reading files with a wildcard path

---
 docs/examples/UserGuide/getting_started.jl |  2 +-
 src/TidierDB.jl                            | 31 +++++++++++++++-------
 src/docstrings.jl                          |  2 +-
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/docs/examples/UserGuide/getting_started.jl b/docs/examples/UserGuide/getting_started.jl
index 3466458..e1e8dbd 100644
--- a/docs/examples/UserGuide/getting_started.jl
+++ b/docs/examples/UserGuide/getting_started.jl
@@ -49,7 +49,7 @@
 # - `db_table` has two required arguments: `connection` and `table`
 # - `table` can be a table name on a database or a path/url to file to read.  When passing `db_table` a path or url, the table is not copied into memory.
 #   - Of note, `db_table` only support direct file paths to a table. It does not support database file paths such as `dbname.duckdb` or `dbname.sqlite`. Such files must be used with `connect` first.
-# - With DuckDB and ClickHouse, if you have a folder of multiple files to read, you can use `*` read in all files matching the pattern. 
+# - With DuckDB and ClickHouse, if you have a folder of multiple files to read, you can use `*` to read in all files matching the pattern, with an optional `alias` argument that sets the name the data is referred to by.
 # - For example, the below would read all files that end in `.csv` in the given folder.
 # ```
 # db_table(db, "folder/path/*.csv")
diff --git a/src/TidierDB.jl b/src/TidierDB.jl
index c0be2a0..89ff989 100644
--- a/src/TidierDB.jl
+++ b/src/TidierDB.jl
@@ -188,7 +188,7 @@ function finalize_query(sqlquery::SQLQuery)
 end
 
 # DuckDB
-function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_name::String)
+function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_name::String; alias::String="")
     set_sql_mode(duckdb());
     if endswith(table_name, ".geoparquet'")
     query = 
@@ -203,12 +203,18 @@ function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_nam
     end
     result = DuckDB.execute(conn, query) |> DataFrame
     result[!, :current_selxn] .= 1
-    table_name = if occursin(r"[:/\\]", table_name)
-        split(basename(table_name), '.')[1]
-        elseif occursin(".", table_name)
-        split(basename(table_name), '.')[end]
+    if occursin("*" , table_name) 
+        if alias != ""
+            table_name = alias
+        else 
+            table_name = "data"
+        end
+    elseif occursin(r"[:/\\]", table_name)
+        table_name = split(basename(table_name), '.')[1]
+    elseif occursin(".", table_name)
+        table_name = split(basename(table_name), '.')[end]
     else
-        table_name
+        table_name = table_name
     end
     if occursin("-" , table_name)
         table_name = replace(table_name, "-" => "_")
@@ -227,7 +233,7 @@ end
 """
 $docstring_db_table
 """
-function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, delta::Bool=false)
+function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, delta::Bool=false, alias::String="")
     table_name = string(table)
     
     if current_sql_mode[] == sqlite()
@@ -249,7 +255,7 @@ function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, de
            metadata = get_table_metadata(db, table_name2)
         elseif occursin(r"[:/\\]", table_name) 
             table_name2 = "'$table_name'"
-            metadata = get_table_metadata(db, table_name2)
+            metadata = get_table_metadata(db, table_name2; alias = alias)
         else
             metadata = get_table_metadata(db, table_name)
         end
@@ -273,10 +279,15 @@ function db_table(db, table, athena_params::Any=nothing; iceberg::Bool=false, de
     elseif delta
         "delta_scan('$table_name')"
     elseif occursin(r"[:/\\]", table_name) && !(iceberg || delta) && !startswith(table_name, "read") 
+        if occursin(r"\*", table_name)
+             alias = alias == "" ? "data" : alias
+        else 
+            alias = (split(basename(table_name), '.')[1])
+        end 
         name = if occursin(".geoparquet", table_name)
-             "read_parquet('$table_name') AS $(split(basename(table_name), '.')[1]) "
+             "read_parquet('$table_name') AS $alias "
         else
-        "'$table_name' AS $(split(basename(table_name), '.')[1]) "
+             "'$table_name' AS $alias "
         end
         formatted_table_name = begin
             parts = split(name, " AS ")
diff --git a/src/docstrings.jl b/src/docstrings.jl
index 5e2aab5..f95790f 100644
--- a/src/docstrings.jl
+++ b/src/docstrings.jl
@@ -1167,7 +1167,7 @@ name it will not copy it to memory, but rather ready directly from the file. `db
 - `db_table(db, "Path/to/testing_files/*.parquet")`
 - `delta`: must be true to read delta files
 - `iceberg`: must be true to read iceberg finalize_ctes
-
+- `alias`: optional argument, used when the file path contains a `*` wildcard, that sets the alias for the data being read in. If empty, the table is referred to as `data`.
 # Example
 ```julia