adds from_query w hidden docs portion for now, bumps version, updat…

…es news
TidierOrg · May 4, 2024 · 6eca6bb · 6eca6bb
1 parent a67fa06
commit 6eca6bb
Show file tree

Hide file tree

Showing 9 changed files with 109 additions and 17 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # TidierDB.jl updates
 
+## v0.1.2 - 2024-05-TBD
+- Adds `@full_join`, `@semi_join`, `@anti_join`
+- Adds `connect()` - a universal connection function for all supported backends
+- Adds `sql_agg()` - allows any aggregate SQL function not available in backend parsers to be used in `@mutate`. Simply write the function as written in SQL syntax as a string wrapped in `sql_agg`, and subsequent windowing is handled by `@mutate`.
+
 ## v0.1.1 - 2024-04-12
 - Fixes metadata retrieval for MariaDB
 - allows for Table.Name style naming in `@select`
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierDB"
 uuid = "86993f9b-bbba-4084-97c5-ee15961ad48b"
 authors = ["Daniel Rizk <rizk.daniel.12@gmail.com> and contributors"]
-version = "0.1.1"
+version = "0.1.2"
 
 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"

diff --git a/docs/examples/UserGuide/from_queryex.jl b/docs/examples/UserGuide/from_queryex.jl
@@ -0,0 +1,62 @@
+# While using TidierDB, you may need to generate part of a query and reuse it multiple times. `from_query()` enables a query portion to be reused multiple times as shown below.
+
+# ```julia
+# import TidierDB as DB
+# con = DB.connect(:duckdb)
+# DB.copy_to(con, "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv", "mtcars2")
+# ```
+
+# Start a query to analyze fuel efficiency by number of cylinders. However, to further build on this query later, end the chain without using `@show_query` or `@collect`
+# ```julia
+# query = DB.@chain DB.db_table(con, :mtcars2) begin
+#     DB.@group_by cyl
+#     DB.@summarize begin
+#         across(mpg, (mean, minimum, maximum))
+#         num_cars = n()
+#         end
+#     DB.@mutate begin
+#         efficiency = case_when(
+#             mean_mpg >= 25, "High",
+#             mean_mpg >= 15, "Moderate",
+#             "Low" )
+#        end
+# end;
+# ```
+
+# Now, `from_query` will allow you to reuse the query to calculate the average horsepower for each efficiency category
+# ```julia
+# DB.@chain DB.from_query(query) begin
+#    DB.@left_join(mtcars2, cyl, cyl)
+#    DB.@group_by(efficiency)
+#    DB.@summarize(avg_hp = mean(hp))
+#    DB.@collect
+# end
+# ```
+# ```
+# 2×2 DataFrame
+#  Row │ efficiency  avg_hp   
+#      │ String?     Float64? 
+# ─────┼──────────────────────
+#    1 │ Moderate    180.238
+#    2 │ High         82.6364
+# ```
+
+# Reuse the query again to find the car with the highest MPG for each cylinder category
+# ```julia
+# DB.@chain DB.from_query(query) begin
+#    DB.@left_join(mtcars2, cyl, cyl)
+#    DB.@group_by cyl
+#    DB.@slice_max(mpg)
+#    DB.@select model cyl mpg
+#    DB.@collect 
+# end
+# ```
+# ```
+# 3×3 DataFrame
+#  Row │ model             cyl     mpg      
+#      │ String?           Int64?  Float64? 
+# ─────┼────────────────────────────────────
+#    1 │ Pontiac Firebird       8      19.2
+#    2 │ Toyota Corolla         4      33.9
+#    3 │ Hornet 4 Drive         6      21.4
+# ```
diff --git a/docs/examples/UserGuide/getting_started.jl b/docs/examples/UserGuide/getting_started.jl
@@ -11,23 +11,21 @@
 
 # There are two ways to connect to the database. You can use `connect` without any need to load any additional packages.
 
-# for example
-
+# For example
+# Connecting to MySQL
 # ```julia
-# Connect to MySQL
 # conn = connect(:mysql; host="localhost", user="root", password="password", db="mydb")
-# versus
-# Connect to DuckDB
-# julia> conn = connect(:duckdb)
+# ```
+# versus connecting to DuckDB
+# ```julia
+# conn = connect(:duckdb)
 # ```
 
-# Alternatively, you can use the packages outlined below and establish a connection directly through their respective methods.
-
-# The associated databased packages used to set up connections are currently as follows
+# Alternatively, you can use the packages outlined below to establish a connection through their respective methods.
 
 # - ClickHouse: ClickHouse.jl
 # - MySQL and MariaDB: MySQL.jl
-# - MSSQL:  ODBC.jl 
+# - MSSQL: ODBC.jl 
 # - Postgres: LibPQ.jl
 # - SQLite: SQLite.jl
 

diff --git a/docs/examples/UserGuide/key_differences.jl b/docs/examples/UserGuide/key_differences.jl
@@ -2,7 +2,7 @@
 
 # ## Creating a database
 
-# For these examples we will use DuckDB, the default backend, although SQLite, Postgres, MySQL, MSSQL, and ClickHouse are possible. If you have an existing DuckDB connection, then this step is not required. For these examples, we will create a data frame and copy it to an in-memory DuckDB database.
+# For these examples we will use DuckDB, the default backend, although SQLite, Postgres, MySQL, MariaDB, MSSQL, and ClickHouse are possible. If you have an existing DuckDB connection, then this step is not required. For these examples, we will create a data frame and copy it to an in-memory DuckDB database.
 
 using DataFrames, TidierDB
 
@@ -11,7 +11,7 @@ df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9],
                         value = repeat(1:5, 2), 
                         percent = 0.1:0.1:1.0);
 
-db = connect(:duckdb)
+db = connect(:duckdb);
 
 copy_to(db, df, "df_mem"); # copying over the data frame to an in-memory database
 
@@ -46,15 +46,15 @@ end
 
 # ## Joining
 
-# There is 1 key difference for joining:
+# There is one key difference for joining:
 
 # The column on both the new and old table must be specified. They do not need to be the same, and given SQL behavior where both columns are kept when joining two tables, it is preferable if they have different names. This avoids "ambiguous reference" errors that would otherwise come up and complicate the use of tidy selection for columns. 
 
 df2 = DataFrame(id2 = ["AA", "AC", "AE", "AG", "AI", "AK", "AM"],
                 category = ["X", "Y", "X", "Y", "X", "Y", "X"],
                 score = [88, 92, 77, 83, 95, 68, 74]);
 
- copy_to(db, df2, "df_join");
+copy_to(db, df2, "df_join");
 
 @chain db_table(db, :df_mem) begin
     @left_join(df_join, id2, id)

diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
@@ -118,4 +118,5 @@ nav:
   - "Home": "index.md"
   - "Key Differences from TidierData.jl" : "examples/generated/UserGuide/key_differences.md"
   - "Getting Started" : "examples/generated/UserGuide/getting_started.md"
+#  - "Reusing Part of a Query" : "examples/generated/UserGuide/from_queryex.md"
   - "Reference" : "reference.md"
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -59,7 +59,7 @@ From TidierDates.jl:
 Supported aggregate functions (as supported by the backend) with more to come
 - `mean`, `minimum`, `maximum`, `std`, `sum`, `cumsum`, `cor`, `cov`, `var`
 - `@summarize` supports any SQL aggregate function in addition to the list above. Simply write the function as written in SQL syntax and it will work 
-- `agg_str` allows any SQL aggregate function not listed above to be used in `@mutate`. Simply write the function expression as written in SQL syntax as a string wrapped in `agg_str`, and subsequent windowing is handled by `@mutate`.
+- `sql_agg` allows any SQL aggregate function not listed above to be used in `@mutate`. Simply write the function expression as written in SQL syntax as a string wrapped in `sql_agg`, and subsequent windowing is handled by `@mutate`.
 - `copy_to` (for DuckDB, MySQL, SQLite)
 
 DuckDB specifically enables `copy_to` to directly read in `.parquet`, `.json`, `.csv`, and `.arrow` files, including https file paths.

diff --git a/src/TidierDB.jl b/src/TidierDB.jl
@@ -22,7 +22,7 @@ import DuckDB: connect as duckdb_connect
  export db_table, set_sql_mode, @arrange, @group_by, @filter, @select, @mutate, @summarize, @summarise, 
  @distinct, @left_join, @right_join, @inner_join, @count, @window_order, @window_frame, @show_query, @collect, @slice_max, 
  @slice_min, @slice_sample, @rename, copy_to, add_interp_parameter!, duckdb_open, duckdb_connect, @semi_join, @full_join, 
- @anti_join, connect
+ @anti_join, connect, from_query
 
 include("docstrings.jl")
 include("structs.jl")

diff --git a/src/structs.jl b/src/structs.jl
@@ -52,3 +52,29 @@ function add_interp_parameter!(name::Symbol, value::Any)
     GLOBAL_CONTEXT.variables[name] = value
 end
 
+function from_query(query::TidierDB.SQLQuery)  # Build and return a new SQLQuery populated from the fields of `query`.
+    # Local helper (scoped to from_query): rebuild a TidierDB.CTE from the six fields listed. NOTE(review): any other CTE fields are not carried over — confirm against the struct.
+    function copy(cte::TidierDB.CTE)
+        return TidierDB.CTE(name=cte.name, select=cte.select, from=cte.from, where=cte.where, groupBy=cte.groupBy, having=cte.having)
+    end
+
+    # Construct the new SQLQuery: fields are passed through as-is, except `metadata` (deep-copied) and `ctes` (each CTE rebuilt).
+    new_query = TidierDB.SQLQuery(
+        select=query.select,
+        from=query.from,
+        where=query.where,
+        groupBy=query.groupBy,
+        orderBy=query.orderBy,
+        having=query.having,
+        window_order=query.window_order,
+        windowFrame=query.windowFrame,
+        is_aggregated=query.is_aggregated,
+        post_aggregation=query.post_aggregation,
+        metadata=deepcopy(query.metadata), # independent copy, so the new query's metadata is not shared with `query`
+        distinct=query.distinct,
+        db=query.db, # passed through, not copied
+        ctes=[copy(cte) for cte in query.ctes],  # new vector of CTEs rebuilt by the local `copy` helper
+        cte_count=query.cte_count
+    )
+    return new_query
+end