diff --git a/NEWS.md b/NEWS.md index 38f0552..147d271 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # TidierDB.jl updates +## v0.1.2 - 2024-05-TBD +- Adds `@full_join`, `@semi_join`, `@anti_join` +- Adds `connect()` - a universal connection function for all supported backends +- Adds `sql_agg()` - allows any aggregate SQL function not available in backend parsers to be used in `@mutate`. Simply write the function as written in SQL syntax as a string wrapped in `sql_agg`, and subsequent windowing is handled by `@mutate`. + ## v0.1.1 - 2024-04-12 - Fixes metadata retrieval for MariaDB - allows for Table.Name style naming in `@select` \ No newline at end of file diff --git a/Project.toml b/Project.toml index 3dd4f74..0e19edc 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierDB" uuid = "86993f9b-bbba-4084-97c5-ee15961ad48b" authors = ["Daniel Rizk and contributors"] -version = "0.1.1" +version = "0.1.2" [deps] Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" diff --git a/docs/examples/UserGuide/from_queryex.jl b/docs/examples/UserGuide/from_queryex.jl new file mode 100644 index 0000000..bc83f99 --- /dev/null +++ b/docs/examples/UserGuide/from_queryex.jl @@ -0,0 +1,62 @@ +# While using TidierDB, you may need to generate part of a query and reuse it multiple times. `from_query()` enables a query portion to be reused multiple times as shown below. + +# ```julia +# import TidierDB as DB +# con = DB.connect(:duckdb) +# DB.copy_to(con, "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv", "mtcars2") +# ``` + +# Start a query to analyze fuel efficiency by number of cylinders. 
However, to further build on this query later, end the chain without using `@show_query` or `@collect` +# ```julia +# query = DB.@chain DB.db_table(con, :mtcars2) begin +# DB.@group_by cyl +# DB.@summarize begin +# across(mpg, (mean, minimum, maximum)) +# num_cars = n() +# end +# DB.@mutate begin +# efficiency = case_when( +# mean_mpg >= 25, "High", +# mean_mpg >= 15, "Moderate", +# "Low" ) +# end +# end; +# ``` + +# Now, `from_query` will allow you to reuse the query to calculate the average horsepower for each efficiency category +# ```julia +# DB.@chain DB.from_query(query) begin +# DB.@left_join(mtcars2, cyl, cyl) +# DB.@group_by(efficiency) +# DB.@summarize(avg_hp = mean(hp)) +# DB.@collect +# end +# ``` +# ``` +# 2×2 DataFrame +# Row │ efficiency avg_hp +# │ String? Float64? +# ─────┼────────────────────── +# 1 │ Moderate 180.238 +# 2 │ High 82.6364 +# ``` + +# Reuse the query again to find the car with the highest MPG for each cylinder category +# ```julia +# DB.@chain DB.from_query(query) begin +# DB.@left_join(mtcars2, cyl, cyl) +# DB.@group_by cyl +# DB.@slice_max(mpg) +# DB.@select model cyl mpg +# DB.@collect +# end +# ``` +# ``` +# 3×3 DataFrame +# Row │ model cyl mpg +# │ String? Int64? Float64? +# ─────┼──────────────────────────────────── +# 1 │ Pontiac Firebird 8 19.2 +# 2 │ Toyota Corolla 4 33.9 +# 3 │ Hornet 4 Drive 6 21.4 +# ``` \ No newline at end of file diff --git a/docs/examples/UserGuide/getting_started.jl b/docs/examples/UserGuide/getting_started.jl index bc9a761..437f7c1 100644 --- a/docs/examples/UserGuide/getting_started.jl +++ b/docs/examples/UserGuide/getting_started.jl @@ -11,23 +11,21 @@ # There are two ways to connect to the database. you can use `connect` without any need to load any additional packages. 
-# for example - +# For example +# Connecting to MySQL # ```julia -# Connect to MySQL # conn = connect(:mysql; host="localhost", user="root", password="password", db="mydb") -# versus -# Connect to DuckDB -# julia> conn = connect(:duckdb) +# ``` +# versus connecting to DuckDB +# ```julia +# conn = connect(:duckdb) # ``` -# Alternatively, you can use the packages outlined below and establish a connection directly through their respective methods. - -# The associated databased packages used to set up connections are currently as follows +# Alternatively, you can use the packages outlined below to establish a connection through their respective methods. # - ClickHouse: ClickHouse.jl # - MySQL and MariaDB: MySQL.jl -# - MSSQL: ODBC.jl +# - MSSQL: ODBC.jl # - Postgres: LibPQ.jl # - SQLite: SQLite.jl diff --git a/docs/examples/UserGuide/key_differences.jl b/docs/examples/UserGuide/key_differences.jl index de65d71..36e8e0a 100644 --- a/docs/examples/UserGuide/key_differences.jl +++ b/docs/examples/UserGuide/key_differences.jl @@ -2,7 +2,7 @@ # ## Creating a database -# For these examples we will use DuckDB, the default backend, although SQLite, Postgres, MySQL, MSSQL, and ClickHouse are possible. If you have an existing DuckDB connection, then this step is not required. For these examples, we will create a data frame and copy it to an in-memory DuckDB database. +# For these examples we will use DuckDB, the default backend, although SQLite, Postgres, MySQL, MariaDB, MSSQL, and ClickHouse are possible. If you have an existing DuckDB connection, then this step is not required. For these examples, we will create a data frame and copy it to an in-memory DuckDB database. 
using DataFrames, TidierDB @@ -11,7 +11,7 @@ df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], value = repeat(1:5, 2), percent = 0.1:0.1:1.0); -db = connect(:duckdb) +db = connect(:duckdb); copy_to(db, df, "df_mem"); # copying over the data frame to an in-memory database @@ -46,7 +46,7 @@ end # ## Joining -# There is 1 key difference for joining: +# There is one key difference for joining: # The column on both the new and old table must be specified. They do not need to be the same, and given SQL behavior where both columns are kept when joining two tables, it is preferable if they have different names. This avoids "ambiguous reference" errors that would otherwise come up and complicate the use of tidy selection for columns. @@ -54,7 +54,7 @@ df2 = DataFrame(id2 = ["AA", "AC", "AE", "AG", "AI", "AK", "AM"], category = ["X", "Y", "X", "Y", "X", "Y", "X"], score = [88, 92, 77, 83, 95, 68, 74]); - copy_to(db, df2, "df_join"); +copy_to(db, df2, "df_join"); @chain db_table(db, :df_mem) begin @left_join(df_join, id2, id) diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 416e77f..3f2c8c7 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -118,4 +118,5 @@ nav: - "Home": "index.md" - "Key Differences from TidierData.jl" : "examples/generated/UserGuide/key_differences.md" - "Getting Started" : "examples/generated/UserGuide/getting_started.md" +# - "Reusing Part of a Query" : "examples/generated/UserGuide/from_queryex.md" - "Reference" : "reference.md" \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index 50db49e..e1046bf 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -59,7 +59,7 @@ From TidierDates.jl: Supported aggregate functions (as supported by the backend) with more to come - `mean`, `minimium`, `maximum`, `std`, `sum`, `cumsum`, `cor`, `cov`, `var` - `@summarize` supports any SQL aggregate function in addition to the list above. 
Simply write the function as written in SQL syntax and it will work -- `agg_str` allows any SQL aggregate function not listed above to be used in `@mutate`. Simply write the function expression as written in SQL syntax as a string wrapped in `agg_str`, and subsequent windowing is handled by `@mutate`. +- `sql_agg` allows any SQL aggregate function not listed above to be used in `@mutate`. Simply write the function expression as written in SQL syntax as a string wrapped in `sql_agg`, and subsequent windowing is handled by `@mutate`. - `copy_to` (for DuckDB, MySQL, SQLite) DuckDB specifically enables copy_to to directly reading in `.parquet`, `.json`, `.csv`, and `.arrow` file, including https file paths. diff --git a/src/TidierDB.jl b/src/TidierDB.jl index 31c61d9..6b69c3a 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -22,7 +22,7 @@ import DuckDB: connect as duckdb_connect export db_table, set_sql_mode, @arrange, @group_by, @filter, @select, @mutate, @summarize, @summarise, @distinct, @left_join, @right_join, @inner_join, @count, @window_order, @window_frame, @show_query, @collect, @slice_max, @slice_min, @slice_sample, @rename, copy_to, add_interp_parameter!, duckdb_open, duckdb_connect, @semi_join, @full_join, - @anti_join, connect + @anti_join, connect, from_query include("docstrings.jl") include("structs.jl") diff --git a/src/structs.jl b/src/structs.jl index fb663c7..4916b8e 100644 --- a/src/structs.jl +++ b/src/structs.jl @@ -52,3 +52,29 @@ function add_interp_parameter!(name::Symbol, value::Any) GLOBAL_CONTEXT.variables[name] = value end
+# Return a new SQLQuery carrying the same field values as `query`, so one partial
+# query can start several chains; `metadata` is deep-copied and each CTE rebuilt.
+function from_query(query::TidierDB.SQLQuery)
+    clone_cte(cte::TidierDB.CTE) = TidierDB.CTE(
+        name=cte.name, select=cte.select, from=cte.from,
+        where=cte.where, groupBy=cte.groupBy, having=cte.having,
+    )
+
+    return TidierDB.SQLQuery(
+        select=query.select,
+        from=query.from,
+        where=query.where,
+        groupBy=query.groupBy,
+        orderBy=query.orderBy,
+        having=query.having,
+        window_order=query.window_order,
+        windowFrame=query.windowFrame,
+        is_aggregated=query.is_aggregated,
+        post_aggregation=query.post_aggregation,
+        metadata=deepcopy(query.metadata),
+        distinct=query.distinct,
+        db=query.db,
+        ctes=[clone_cte(cte) for cte in query.ctes],
+        cte_count=query.cte_count,
+    )
+end
\ No newline at end of file