diff --git a/.gitignore b/.gitignore index b067edd..a19d79c 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /Manifest.toml +/.vscode \ No newline at end of file diff --git a/docs/examples/UserGuide/getting_started.jl b/docs/examples/UserGuide/getting_started.jl index b845fb7..80ac340 100644 --- a/docs/examples/UserGuide/getting_started.jl +++ b/docs/examples/UserGuide/getting_started.jl @@ -1,19 +1,20 @@ -# To use TidierDB.jl, you will have to set up a connection. TidierDB.jl gives you access to duckdb via `duckdb_open` and `duckdb_connect`. However, to use MySql, ClickHouse, MSSQL, Postgres, or SQLite, you will have to load that packages in first. +# To use TidierDB.jl, you will have to set up a connection. TidierDB.jl gives you access to duckdb via `duckdb_open` and `duckdb_connect`. However, to use MySql, ClickHouse, MSSQL, Postgres, or SQLite, you will have to load those packages in first. # If you plan to use TidierDB.jl with TidierData.jl or Tidier.jl, it is most convenenient to load the packages as follows: +# ```julia +# using TidierData +# import TidierDB as DB +# ``` -# - using Tidier # or TidierData -# - import TidierDB as DB. +# Alternatively, `using Tidier` will import TidierDB in the above manner for you, where TidierDB functions and macros will be available as `DB.@mutate()` and so on, and the TidierData equivalent would be `@mutate()`. -# Afterwards, all of the TidierDB macros will be available as DB.@mutate and so on, and the TidierData equivalent would be @mutate. 
+# The associated database packages used to set up connections are currently as follows: -# The associated databased packages used to set up connections are currently as follows +# - ClickHouse: ClickHouse.jl +# - MySQL: MySQL.jl +# - MSSQL: ODBC.jl +# - Postgres: LibPQ.jl +# - SQLite: SQLite.jl -# - ClickHouse - ClickHouse.jl -# - MySQL - MySQL.jl -# - MSSQL - ODBC.jl -# - Postgres - LibPQ.jl -# - SQLite - SQLite.jl - -# For DuckDB, SQLite, and MySQL, `copy_to` lets you copy data to the database and query there. ClickHouse, MSSQL, and Postgres support for `copy_to` has not been added yet. +# For DuckDB, SQLite, and MySQL, `copy_to()` lets you copy data to the database and query there. ClickHouse, MSSQL, and Postgres support for `copy_to()` has not been added yet. diff --git a/docs/examples/UserGuide/key_differences.jl b/docs/examples/UserGuide/key_differences.jl index c421f20..89fd23f 100644 --- a/docs/examples/UserGuide/key_differences.jl +++ b/docs/examples/UserGuide/key_differences.jl @@ -1,11 +1,11 @@ # There are a few important syntax and behavior differences between TidierDB.jl and TidierData.jl outlined below. -## Starting Chain -# `db_table(connection, :table_name)` is used to start a chain instead of a classic dataframe +# ## Creating a database + +# For these examples we will use DuckDB, the default backend, although SQLite, Postgres, MySQL, MSSQL, and ClickHouse are possible. If you have an existing DuckDB connection, then this step is not required. For these examples, we will create a data frame and copy it to an in-memory DuckDB database. + +using DataFrames, TidierDB -## group_by -> mutate -# In TidierDB, when performing `@group_by` then `@mutate`, after applying all of the mutations in the clause to the grouped data, the table is ungrouped. To perform subsequent grouped mutations/slices/summarizations, the user would have to regroup the data. This is something we will work to resolve, but as of version .0.1.0, this is the bevahior. 
This is demonstrated below with -using TidierDB df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], groups = [i % 2 == 0 ? "aa" : "bb" for i in 1:10], value = repeat(1:5, 2), @@ -13,28 +13,42 @@ df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], mem = duckdb_open(":memory:"); db = duckdb_connect(mem); -# For these examples we will use DuckDB, the default backend, although SQLite, Postgres, MySQL, MSSQL, and ClickHouse are possible. -# copy_to(db, df, "df_mem"); # copying over the df to memory + +copy_to(db, df, "df_mem"); # copying over the data frame to an in-memory database + +# ## Row ordering + +# DuckDB benefits from aggressive parallelization of pipelines. This means that if you have multiple threads enabled in Julia, which you can check or set using `Threads.nthreads()`, DuckDB will use multiple threads. However, because many operations are multi-threaded, the resulting row order is inconsistent. If row order needs to be deterministic for your use case, make sure to apply an `@arrange(column_name_1, column_name_2, etc...)` prior to collecting the results. + +# ## Starting a chain + +# When using TidierDB, `db_table(connection, :table_name)` is used to start a chain. + +# ## Grouped mutation + +# In TidierDB, when performing `@group_by` then `@mutate`, the table will be ungrouped after applying all of the mutations in the clause to the grouped data. To perform subsequent grouped operations, the user would have to regroup the data. This is demonstrated below. 
+ @chain db_table(db, :df_mem) begin @group_by(groups) - @summarise(mean = mean(percent)) - @slice_max(percent) + @summarize(mean_percent = mean(percent)) @collect - end + end - @chain db_table(db, :df_mem) begin +@chain db_table(db, :df_mem) begin @group_by(groups) @mutate(max = maximum(percent), min = minimum(percent)) @group_by(groups) - @summarise(mean = mean(percent)) + @summarise(mean_percent = mean(percent)) @collect -end +end + +# ## Joining -## Joining # There are 2 key differences for joining: + # 1. When joining 2 tables, the new table you are choosing to join must be prefixed with a colon. -# 2. The column on both the new and old table must be specified. They do not need to be the same, and given SQL behavior where both columns are kept when joining two tables, it is preferrable if they have different names. This avoids "ambiguous reference" errors that would otherwise come up and complicate the use of tidy selection for columns. +# 2. The column on both the new and old table must be specified. They do not need to be the same, and given SQL behavior where both columns are kept when joining two tables, it is preferable if they have different names. This avoids "ambiguous reference" errors that would otherwise come up and complicate the use of tidy selection for columns. 
df2 = DataFrame(id2 = ["AA", "AC", "AE", "AG", "AI", "AK", "AM"], category = ["X", "Y", "X", "Y", "X", "Y", "X"], @@ -42,14 +56,16 @@ df2 = DataFrame(id2 = ["AA", "AC", "AE", "AG", "AI", "AK", "AM"], copy_to(db, df2, "df_join"); - @chain db_table(db, :df_mem) begin +@chain db_table(db, :df_mem) begin @left_join(:df_join, id2, id) @collect end -## `case_when` -# In TidierDB, after the clause is completed, the result for the new column should is separated by comma ( , ) -# this is in contrast to TidierData.jl, where the result for the new column is separated by a => +# ## Differences in `case_when()` + +# In TidierDB, after the clause is completed, the result for the new column is separated by a comma `,` +# in contrast to TidierData.jl, where the result for the new column is separated by a `=>`. + @chain db_table(db, :df_mem) begin @mutate(new_col = case_when(percent > .5, "Pass", # in TidierData, percent > .5 => "Pass", percent <= .5, "Try Again", # percent <= .5 => "Try Again" @@ -57,10 +73,13 @@ end @collect end -## Interpolation -# To use !! Interpolation, instead of being able to define the alternate names/value in the global context, the user has to `add_interp_parameter!`. This will hopefully be fixed in future versions. Otherwise behavior is the same. +# ## Interpolation + +# To use !! Interpolation, instead of being able to define the alternate names/value in the global context, the user has to use `add_interp_parameter!`. This will hopefully be fixed in future versions. Otherwise, the behavior is the same. + # Also, when using interpolation with exponenents, the interpolated value must go inside of parenthesis. 
-# add_interp_parameter!(:test, :percent) # this still supports strings, vectors of names, and values + +add_interp_parameter!(:test, :percent) # this still supports strings, vectors of names, and values @chain db_table(db, :df_mem) begin @mutate(new_col = case_when((!!test)^2 > .5, "Pass", @@ -69,5 +88,6 @@ end @collect end -## Slicing Ties -# Slice will always return ties due to SQL behavior +# ## Slicing ties + +# `@slice_min()` and `@slice_max()` will always return ties due to SQL behavior. diff --git a/src/docstrings.jl b/src/docstrings.jl index e63fdc0..b3afa8a 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -23,8 +23,8 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @select(groups:percent) - @collect + @select(groups:percent) + @collect end 10×3 DataFrame Row │ groups value percent @@ -42,8 +42,8 @@ julia> @chain db_table(db, :df_mem) begin 10 │ aa 5 1.0 julia> @chain db_table(db, :df_mem) begin - @select(contains("e")) - @collect + @select(contains("e")) + @collect end 10×2 DataFrame Row │ value percent @@ -90,8 +90,8 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @filter(percent > .5) - @collect + @filter(percent > .5) + @collect end 5×4 DataFrame Row │ id groups value percent @@ -104,20 +104,21 @@ julia> @chain db_table(db, :df_mem) begin 5 │ AJ aa 5 1.0 julia> @chain db_table(db, :df_mem) begin - @group_by(groups) - @summarise(mean = mean(percent)) - @filter begin - groups == "bb" || # logical operators can still be used like this - mean > .5 - end - @collect + @group_by(groups) + @summarise(mean = mean(percent)) + @filter begin + groups == "bb" || # logical operators can still be used like this + mean > .5 + end + @arrange(groups) + @collect end 2×2 DataFrame Row │ groups mean │ String? Float64? 
─────┼─────────────────── - 1 │ bb 0.5 - 2 │ aa 0.6 + 1 │ aa 0.6 + 2 │ bb 0.5 ``` """ @@ -125,7 +126,7 @@ const docstring_group_by = """ @group_by(sql_query, columns...) -Group SQL table rows by specified column(s). +Group SQL table rows by specified column(s). If grouping is performed as a terminal operation without a subsequent mutation or summarization (as in the example below), then the resulting data frame will be ungrouped when `@collect` is applied. # Arguments - `sql_query`: The SQL query to operate on. @@ -145,15 +146,16 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @group_by(groups) - @collect - end + @group_by(groups) + @arrange(groups) + @collect + end 2×1 DataFrame Row │ groups │ String? ─────┼───────── - 1 │ bb - 2 │ aa + 1 │ aa + 2 │ bb ``` """ @@ -180,8 +182,8 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @mutate(value = value * 4, new_col = percent^2) - @collect + @mutate(value = value * 4, new_col = percent^2) + @collect end 10×5 DataFrame Row │ id groups value percent new_col @@ -223,28 +225,30 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @group_by(groups) - @summarise(across((ends_with("e"), starts_with("p")), (mean, sum))) - @collect + @group_by(groups) + @summarise(across((ends_with("e"), starts_with("p")), (mean, sum))) + @arrange(groups) + @collect end 2×5 DataFrame Row │ groups mean_value mean_percent sum_value sum_percent │ String? Float64? Float64? Int128? Float64? 
─────┼─────────────────────────────────────────────────────────── - 1 │ bb 3.0 0.5 15 2.5 - 2 │ aa 3.0 0.6 15 3.0 + 1 │ aa 3.0 0.6 15 3.0 + 2 │ bb 3.0 0.5 15 2.5 julia> @chain db_table(db, :df_mem) begin - @group_by(groups) - @summarise(test = sum(percent), n =n()) - @collect + @group_by(groups) + @summarise(test = sum(percent), n = n()) + @arrange(groups) + @collect end 2×3 DataFrame Row │ groups test n │ String? Float64? Int64? ─────┼─────────────────────────── - 1 │ bb 2.5 5 - 2 │ aa 3.0 5 + 1 │ aa 3.0 5 + 2 │ bb 2.5 5 ``` """ const docstring_summarise = @@ -270,28 +274,30 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @group_by(groups) - @summarise(across((value:percent), (mean, sum))) - @collect + @group_by(groups) + @summarise(across((value:percent), (mean, sum))) + @arrange(groups) + @collect end 2×5 DataFrame Row │ groups mean_value mean_percent sum_value sum_percent │ String? Float64? Float64? Int128? Float64? ─────┼─────────────────────────────────────────────────────────── - 1 │ bb 3.0 0.5 15 2.5 - 2 │ aa 3.0 0.6 15 3.0 + 1 │ aa 3.0 0.6 15 3.0 + 2 │ bb 3.0 0.5 15 2.5 julia> @chain db_table(db, :df_mem) begin - @group_by(groups) - @summarise(test = sum(percent), n = n()) - @collect + @group_by(groups) + @summarise(test = sum(percent), n = n()) + @arrange(groups) + @collect end 2×3 DataFrame Row │ groups test n │ String? Float64? Int64? ─────┼─────────────────────────── - 1 │ bb 2.5 5 - 2 │ aa 3.0 5 + 1 │ aa 3.0 5 + 2 │ bb 2.5 5 ``` """ @@ -320,22 +326,14 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @group_by(groups) - @slice_min(value, n = 2) - @collect - end -4×5 DataFrame - Row │ id groups value percent rank_col - │ String? String? Int64? Float64? Int64? 
-─────┼────────────────────────────────────────────── - 1 │ AG bb 2 0.7 2 - 2 │ AB aa 2 0.2 2 - 3 │ AA bb 1 0.1 1 - 4 │ AF aa 1 0.6 1 + @group_by(groups) + @slice_min(value, n = 2) + @collect + end; julia> @chain db_table(db, :df_mem) begin - @slice_min(value) - @collect + @slice_min(value) + @collect end 2×5 DataFrame Row │ id groups value percent rank_col @@ -371,22 +369,14 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @group_by(groups) - @slice_max(value, n = 2) - @collect - end -4×5 DataFrame - Row │ id groups value percent rank_col - │ String? String? Int64? Float64? Int64? -─────┼────────────────────────────────────────────── - 1 │ AE bb 5 0.5 1 - 2 │ AI bb 4 0.9 2 - 3 │ AJ aa 5 1.0 1 - 4 │ AD aa 4 0.4 2 + @group_by(groups) + @slice_max(value, n = 2) + @collect + end; julia> @chain db_table(db, :df_mem) begin - @slice_max(value) - @collect + @slice_max(value) + @collect end 2×5 DataFrame Row │ id groups value percent rank_col @@ -420,9 +410,9 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @group_by(groups) - @slice_sample(n = 2) - @collect + @group_by(groups) + @slice_sample(n = 2) + @collect end; julia> @chain db_table(db, :df_mem) begin @@ -456,8 +446,8 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @arrange(value, desc(percent)) - @collect + @arrange(value, desc(percent)) + @collect end 10×4 DataFrame Row │ id groups value percent @@ -500,15 +490,16 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @count(groups) - @collect + @count(groups) + @arrange(groups) + @collect end 2×2 DataFrame Row │ groups count │ String? Int64? 
─────┼───────────────── - 1 │ bb 5 - 2 │ aa 5 + 1 │ aa 5 + 2 │ bb 5 ``` """ @@ -537,8 +528,9 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin - @distinct value - @collect + @distinct(value) + @arrange(value) + @collect end 5×1 DataFrame Row │ value @@ -551,8 +543,9 @@ julia> @chain db_table(db, :df_mem) begin 5 │ 5 julia> @chain db_table(db, :df_mem) begin - @distinct - @collect + @distinct + @arrange(id) + @collect end 10×4 DataFrame Row │ id groups value percent @@ -606,8 +599,8 @@ julia> copy_to(db, df, "df_mem"); julia> copy_to(db, df2, "df_join"); julia> @chain db_table(db, :df_mem) begin - @left_join(:df_join, id2, id) - @collect + @left_join(:df_join, id2, id) + @collect end 10×7 DataFrame Row │ id groups value percent id2 category score @@ -661,8 +654,8 @@ julia> copy_to(db, df, "df_mem"); julia> copy_to(db, df2, "df_join"); julia> @chain db_table(db, :df_mem) begin - @right_join(:df_join, id2, id) - @collect + @right_join(:df_join, id2, id) + @collect end 7×7 DataFrame Row │ id groups value percent id2 category score @@ -713,8 +706,8 @@ julia> copy_to(db, df, "df_mem"); julia> copy_to(db, df2, "df_join"); julia> @chain db_table(db, :df_mem) begin - @inner_join(:df_join, id2, id) - @collect + @inner_join(:df_join, id2, id) + @collect end 5×7 DataFrame Row │ id groups value percent id2 category score @@ -753,13 +746,23 @@ julia> copy_to(db, df, "df_mem"); julia> @chain db_table(db, :df_mem) begin @rename(new_name = percent) - @show_query + @collect end -WITH cte_1 AS ( -SELECT id, groups, value, percent AS new_name - FROM df_mem) -SELECT * - FROM cte_1 +10×4 DataFrame + Row │ id groups value new_name + │ String? String? Int64? Float64? 
+─────┼──────────────────────────────────── + 1 │ AA bb 1 0.1 + 2 │ AB aa 2 0.2 + 3 │ AC bb 3 0.3 + 4 │ AD aa 4 0.4 + 5 │ AE bb 5 0.5 + 6 │ AF aa 1 0.6 + 7 │ AG bb 2 0.7 + 8 │ AH aa 3 0.8 + 9 │ AI bb 4 0.9 + 10 │ AJ aa 5 1.0 +``` """ const docstring_copy_to = @@ -836,6 +839,4 @@ julia> db = duckdb_connect(mem); julia> copy_to(db, df, "df_mem"); ``` -""" - - +""" \ No newline at end of file