diff --git a/previews/PR28/reference/index.html b/previews/PR28/reference/index.html index 37e9640..4d957c5 100644 --- a/previews/PR28/reference/index.html +++ b/previews/PR28/reference/index.html @@ -621,7 +621,7 @@
#
TidierDB.@anti_join
— Macro.
@anti_join(sql_query, join_table, new_table_col, orignal_table_col)
@@ -687,7 +687,7 @@ Reference - Exported functions 4 │ AH aa 3 0.8
5 │ AJ aa 5 1.0
#
TidierDB.@arrange
— Macro.
@arrange(sql_query, columns...)
@@ -727,7 +727,7 @@ Reference - Exported functions 9 │ AJ aa 5 1.0
10 │ AE bb 5 0.5
#
TidierDB.@count
— Macro.
@count(sql_query, columns...)
@@ -760,7 +760,7 @@ Reference - Exported functions 1 │ aa 5
2 │ bb 5
#
TidierDB.@distinct
— Macro.
@distinct(sql_query, columns...)
@@ -813,7 +813,7 @@ Reference - Exported functions 9 │ AI bb 4 0.9
10 │ AJ aa 5 1.0
#
TidierDB.@filter
— Macro.
@filter(sql_query, conditions...)
@@ -869,7 +869,7 @@ Reference - Exported functions 1 │ aa 0.6
2 │ bb 0.5
#
TidierDB.@full_join
— Macro.
@full_join(sql_query, join_table, new_table_col, orignal_table_col)
@@ -919,7 +919,7 @@ Reference - Exported functions 11 │ missing missing missing missing AK Y 68
12 │ missing missing missing missing AM X 74
#
TidierDB.@group_by
— Macro.
@group_by(sql_query, columns...)
@@ -952,7 +952,7 @@ Reference - Exported functions 1 │ aa
2 │ bb
#
TidierDB.@inner_join
— Macro.
@inner_join(sql_query, join_table, new_table_col, orignal_table_col)
@@ -995,7 +995,7 @@ Reference - Exported functions 4 │ AG bb 2 0.7 AG Y 83
5 │ AI bb 4 0.9 AI X 95
#
TidierDB.@interpolate
— Macro.
@interpolate(args...)
@@ -1043,7 +1043,7 @@ Reference - Exported functions─────┼───────────────────────────
1 │ AA 1 0.1
#
TidierDB.@left_join
— Macro.
@left_join(sql_query, join_table, new_table_col, orignal_table_col)
@@ -1091,7 +1091,7 @@ Reference - Exported functions 9 │ AH aa 3 0.8 missing missing missing
10 │ AJ aa 5 1.0 missing missing missing
#
TidierDB.@mutate
— Macro.
@mutate(sql_query, exprs...)
@@ -1131,7 +1131,7 @@ Reference - Exported functions 9 │ AI bb 16 0.9 0.81
10 │ AJ aa 20 1.0 1.0
#
TidierDB.@rename
— Macro.
@rename(sql_query, renamings...)
@@ -1168,7 +1168,7 @@ Reference - Exported functions 9 │ AI bb 4 0.9
10 │ AJ aa 5 1.0
#
TidierDB.@right_join
— Macro.
@right_join(sql_query, join_table, new_table_col, orignal_table_col)
@@ -1213,7 +1213,7 @@ Reference - Exported functions 6 │ missing missing missing missing AK Y 68
7 │ missing missing missing missing AM X 74
#
TidierDB.@select
— Macro.
@select(sql_query, columns)
@@ -1272,7 +1272,7 @@ Reference - Exported functions 9 │ 4 0.9
10 │ 5 1.0
#
TidierDB.@semi_join
— Macro.
@semi_join(sql_query, join_table, new_table_col, orignal_table_col)
@@ -1315,7 +1315,7 @@ Reference - Exported functions 4 │ AG bb 2 0.7
5 │ AI bb 4 0.9
#
TidierDB.@slice_max
— Macro.
@slice_max(sql_query, column, n = 1)
@@ -1354,7 +1354,7 @@ Reference - Exported functions 1 │ AE bb 5 0.5 1
2 │ AJ aa 5 1.0 1
#
TidierDB.@slice_min
— Macro.
@slice_min(sql_query, column, n = 1)
@@ -1393,7 +1393,7 @@ Reference - Exported functions 1 │ AA bb 1 0.1 1
2 │ AF aa 1 0.6 1
#
TidierDB.@slice_sample
— Macro.
@slice_sample(sql_query, n)
@@ -1425,7 +1425,7 @@ Reference - Exported functions @collect
end;
#
TidierDB.@summarise
— Macro.
@summarise(sql_query, exprs...)
@@ -1472,7 +1472,7 @@ Reference - Exported functions 1 │ aa 3.0 5
2 │ bb 2.5 5
#
TidierDB.@summarize
— Macro.
@summarize(sql_query, exprs...)
@@ -1519,7 +1519,7 @@ Reference - Exported functions 1 │ aa 3.0 5
2 │ bb 2.5 5
#
TidierDB.@window_frame
— Macro.
@window_frame(sql_query, frame_start::Int, frame_end::Int)
@@ -1541,7 +1541,7 @@ Reference - Exported functionsjulia> copy_to(db, df, "df_mem");
#
TidierDB.@window_order
— Macro.
@window_order(sql_query, columns...)
@@ -1562,7 +1562,7 @@ Reference - Exported functionsjulia> copy_to(db, df, "df_mem");
TidierDB.jl is a 100% Julia implementation of the dbplyr R package, and similar to Python's ibis package.
The main goal of TidierDB.jl is to bring the syntax of Tidier.jl to multiple SQL backends, making it possible to analyze data directly on databases without needing to copy the entire database into memory.
"},{"location":"#currently-supported-backends-include","title":"Currently supported backends include:","text":"set_sql_mode(:duckdb)
set_sql_mode(:clickhouse)
set_sql_mode(:lite)
set_sql_mode(:mysql)
set_sql_mode(:mssql)
set_sql_mode(:postgres)
set_sql_mode(:athena)
set_sql_mode(:gbq)
set_sql_mode(:oracle)
The style of SQL that is generated can be modified using set_sql_mode()
.
For the stable version:
] add TidierDB\n
TidierDB.jl currently supports the following top-level macros:
@arrange
@group_by
@filter
@select
@mutate
, which supports across()
@summarize
and @summarise
, which supports across()
@distinct
@left_join
, @right_join
, @inner_join
, @anti_join
, @full_join
, and @semi_join
(slight syntax differences from TidierData.jl)@count
@slice_min
, @slice_max
, @slice_sample
@window_order
and @window_frame
@show_query
@collect
Supported helper functions for most backends include:
across()
desc()
if_else()
and case_when()
n()
starts_with()
, ends_with()
, and contains()
as_float()
, as_integer()
, and as_string()
is_missing()
missing_if()
and replace_missing()
From TidierStrings.jl:
str_detect
, str_replace
, str_replace_all
, str_remove_all
, str_remove
From TidierDates.jl:
year
, month
, day
, hour
, min
, second
, floor_date
, difftime
Supported aggregate functions (as supported by the backend) with more to come
mean
, minimum
, maximum
, std
, sum
, cumsum
, cor
, cov
, var
@summarize
supports any SQL aggregate function in addition to the list above. Simply write the function as written in SQL syntax and it will work.
copy_to
(for DuckDB, MySQL, SQLite). DuckDB specifically enables copy_to to directly read in .parquet
, .json
, .csv
, and .arrow
files, including https file paths.
path = \"file_path.parquet\"\ncopy_to(conn, path, \"table_name\")\n
"},{"location":"#what-is-the-recommended-way-to-use-tidierdb","title":"What is the recommended way to use TidierDB?","text":"Typically, you will want to use TidierDB alongside TidierData because there is certain functionality (such as pivoting) which is only supported in TidierData and can only be performed on data frames.
Our recommended path for using TidierDB is to import the package so that there are no namespace conflicts with TidierData. Once TidierDB is integrated with Tidier, then Tidier will automatically load the packages in this fashion.
First, let's develop and execute a query using TidierDB. Notice that all top-level macros and functions originating from TidierDB start with a DB
prefix. Any functions defined within macros do not need to be prefixed with DB
because they are pseudofunctions that are converted into SQL code.
Even though the code reads similarly to TidierData, note that no computational work actually occurs until you run DB.@collect()
, which runs the SQL query and instantiates the result as a DataFrame.
using TidierData\nimport TidierDB as DB\n\ndb = DB.connect(:duckdb);\npath = \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\"\nDB.copy_to(db, path, \"mtcars\");\n\n@chain DB.db_table(db, :mtcars) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@collect\nend\n
2\u00d75 DataFrame\n Row \u2502 cyl mpg mpg_squared mpg_rounded mpg_efficiency \n \u2502 Int64? Float64? Float64? Float64? String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 27.3444 747.719 27.0 efficient\n 2 \u2502 6 19.7333 389.404 20.0 moderate\n
"},{"location":"#what-if-we-wanted-to-pivot-the-result","title":"What if we wanted to pivot the result?","text":"We cannot do this using TidierDB. However, we can call @pivot_longer()
from TidierData after the result of the query has been instantiated as a DataFrame, like this:
@chain DB.db_table(db, :mtcars) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@collect\n @pivot_longer(everything(), names_to = \"variable\", values_to = \"value\")\nend\n
10\u00d72 DataFrame\n Row \u2502 variable value \n \u2502 String Any \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 cyl 4\n 2 \u2502 cyl 6\n 3 \u2502 mpg 27.3444\n 4 \u2502 mpg 19.7333\n 5 \u2502 mpg_squared 747.719\n 6 \u2502 mpg_squared 389.404\n 7 \u2502 mpg_rounded 27.0\n 8 \u2502 mpg_rounded 20.0\n 9 \u2502 mpg_efficiency efficient\n 10 \u2502 mpg_efficiency moderate\n
"},{"location":"#what-sql-query-does-tidierdb-generate-for-a-given-piece-of-julia-code","title":"What SQL query does TidierDB generate for a given piece of Julia code?","text":"We can replace DB.@collect
with DB.@show_query
to reveal the underlying SQL query being generated by TidierDB. To handle complex queries, TidierDB makes heavy use of Common Table Expressions (CTE), which are a useful tool to organize long queries.
@chain DB.db_table(db, :mtcars) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@show_query\nend\n
WITH cte_1 AS (\nSELECT *\n FROM mtcars\n WHERE NOT (starts_with(model, 'M'))),\ncte_2 AS (\nSELECT cyl, AVG(mpg) AS mpg\n FROM cte_1\n GROUP BY cyl),\ncte_3 AS (\nSELECT cyl, mpg, POWER(mpg, 2) AS mpg_squared, ROUND(mpg) AS mpg_rounded, CASE WHEN mpg >= POWER(cyl, 2) THEN 'efficient' WHEN mpg < 15.2 THEN 'inefficient' ELSE 'moderate' END AS mpg_efficiency\n FROM cte_2 ),\ncte_4 AS (\nSELECT *\n FROM cte_3\n WHERE mpg_efficiency in ('moderate', 'efficient')) \nSELECT *\n FROM cte_4 \n ORDER BY mpg_rounded DESC\n
"},{"location":"#tidierdb-is-already-quite-fully-featured-supporting-advanced-tidierdata-functions-like-across-for-multi-column-selection","title":"TidierDB is already quite fully-featured, supporting advanced TidierData functions like across()
for multi-column selection.","text":"@chain DB.db_table(db, :mtcars) begin\n DB.@group_by(cyl)\n DB.@summarize(across((starts_with(\"a\"), ends_with(\"s\")), (mean, sum)))\n DB.@collect\nend\n
3\u00d75 DataFrame\n Row \u2502 cyl mean_am mean_vs sum_am sum_vs \n \u2502 Int64? Float64? Float64? Int128? Int128? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 0.727273 0.909091 8 10\n 2 \u2502 6 0.428571 0.571429 3 4\n 3 \u2502 8 0.142857 0.0 2 0\n
Bang bang !!
interpolation for columns and values is also supported.
There are a few subtle but important differences from Tidier.jl outlined here.
"},{"location":"#missing-a-function-or-backend","title":"Missing a function or backend?","text":"You can use any existing SQL function within @mutate
with the correct SQL syntax and it should just work.
But if you run into problems please open an issue, and we will be happy to take a look!
"},{"location":"reference/","title":"Reference","text":""},{"location":"reference/#index","title":"Index","text":"TidierDB.connect
TidierDB.copy_to
TidierDB.@anti_join
TidierDB.@arrange
TidierDB.@count
TidierDB.@distinct
TidierDB.@filter
TidierDB.@full_join
TidierDB.@group_by
TidierDB.@inner_join
TidierDB.@interpolate
TidierDB.@left_join
TidierDB.@mutate
TidierDB.@rename
TidierDB.@right_join
TidierDB.@select
TidierDB.@semi_join
TidierDB.@slice_max
TidierDB.@slice_min
TidierDB.@slice_sample
TidierDB.@summarise
TidierDB.@summarize
TidierDB.@window_frame
TidierDB.@window_order
# TidierDB.connect
\u2014 Method.
connect(backend::Symbol; kwargs...)\n
This function establishes a database connection based on the specified backend and connection parameters and sets the SQL mode
Arguments
backend
: A symbol specifying the database backend to connect to. Supported backends are:
:duckdb
, :lite
(SQLite), :mssql
, :mysql
(for MariaDB and MySQL), :clickhouse
, :postgres
kwargs
: Keyword arguments specifying the connection parameters for the selected backend. The required parameters vary depending on the backend:
MySQL:
host
: The host name or IP address of the MySQL server. Default is \"localhost\".user
: The username for authentication. Default is an empty string.password
: The password for authentication.db
: The name of the database to connect to (optional).port
: The port number of the MySQL server (optional).Returns
Examples
**Connect to MySQL**\n\n**conn = connect(:mysql; host=\"localhost\", user=\"root\", password=\"password\", db=\"mydb\")**\n\n**Connect to PostgreSQL using LibPQ**\n\n**conn = connect(:postgres; host=\"localhost\", dbname=\"mydb\", user=\"postgres\", password=\"password\")**\n\n**Connect to ClickHouse**\n\n**conn = connect(:clickhouse; host=\"localhost\", port=9000, database=\"mydb\", user=\"default\", password=\"\")**\n\n**Connect to SQLite**\n\n**conn = connect(:lite)**\n\n**Connect to Google BigQuery**\n\n**conn = connect(:gbq, \"json_user_key_path\", \"project_id\")**\n\n**Connect to DuckDB**\n\njulia> db = connect(:duckdb) DuckDB.Connection(\":memory:\")\n\n\n<a target='_blank' href='https://github.com/TidierOrg/TidierDB.jl/blob/14b0ddb971f3ea6c18ece5470584839361e63471/src/TidierDB.jl#L324-L358' class='documenter-source'>source</a><br>\n\n<a id='TidierDB.copy_to-Tuple{Any, Union{AbstractString, DataFrame}, String}' href='#TidierDB.copy_to-Tuple{Any, Union{AbstractString, DataFrame}, String}'>#</a>\n**`TidierDB.copy_to`** — *Method*.\n\n\n\n```julia\n copy_to(conn, df_or_path, \"name\")\n
Allows the user to copy a df to the database connection. Currently supports DuckDB, SQLite, and MySQL.
Arguments
-conn
: the database connection -df
: dataframe to be copied or path to serve as source. With DuckDB, path supports .csv, .json, .parquet to be used without copying intermediary df. -name
: name, as a string, of the table to be created in the database
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"test\");\n
source
# TidierDB.@anti_join
\u2014 Macro.
@anti_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform an anti join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable, for the metadata storage, for the names to be different.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @anti_join(df_join, id2, id)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AB aa 2 0.2\n 2 \u2502 AD aa 4 0.4\n 3 \u2502 AF aa 1 0.6\n 4 \u2502 AH aa 3 0.8\n 5 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@arrange
\u2014 Macro.
@arrange(sql_query, columns...)\n
Order SQL table rows based on specified column(s).
Arguments
sql_query
: The SQL query to operate on.columns
: Columns to order the rows by. Can include multiple columns for nested sorting. Wrap column name with desc()
for descending order.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @arrange(value, desc(percent))\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AF aa 1 0.6\n 2 \u2502 AA bb 1 0.1\n 3 \u2502 AG bb 2 0.7\n 4 \u2502 AB aa 2 0.2\n 5 \u2502 AH aa 3 0.8\n 6 \u2502 AC bb 3 0.3\n 7 \u2502 AI bb 4 0.9\n 8 \u2502 AD aa 4 0.4\n 9 \u2502 AJ aa 5 1.0\n 10 \u2502 AE bb 5 0.5\n
source
# TidierDB.@count
\u2014 Macro.
@count(sql_query, columns...)\n
Count the number of rows grouped by specified column(s).
Arguments
sql_query
: The SQL query to operate on.columns
: Columns to group by before counting. If no columns are specified, counts all rows in the query.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @count(groups)\n @arrange(groups)\n @collect\n end\n2\u00d72 DataFrame\n Row \u2502 groups count \n \u2502 String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 5\n 2 \u2502 bb 5\n
source
# TidierDB.@distinct
\u2014 Macro.
@distinct(sql_query, columns...)\n
Select distinct rows based on specified column(s). Distinct works differently in TidierData vs SQL and therefore TidierDB. Distinct will also select only the columns it is given (or all if given none).
Arguments
sql_query
: The SQL query to operate on. columns
: Columns to determine uniqueness. If no columns are specified, all columns are used to identify distinct rows.
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @distinct(value)\n @arrange(value)\n @collect\n end\n5\u00d71 DataFrame\n Row \u2502 value \n \u2502 Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1\n 2 \u2502 2\n 3 \u2502 3\n 4 \u2502 4\n 5 \u2502 5\n\njulia> @chain db_table(db, :df_mem) begin\n @distinct\n @arrange(id)\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AB aa 2 0.2\n 3 \u2502 AC bb 3 0.3\n 4 \u2502 AD aa 4 0.4\n 5 \u2502 AE bb 5 0.5\n 6 \u2502 AF aa 1 0.6\n 7 \u2502 AG bb 2 0.7\n 8 \u2502 AH aa 3 0.8\n 9 \u2502 AI bb 4 0.9\n 10 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@filter
\u2014 Macro.
@filter(sql_query, conditions...)\n
Filter rows in a SQL table based on specified conditions.
Arguments
sql_query
: The SQL query to filter rows from.conditions
: Expressions specifying the conditions that rows must satisfy to be included in the output. Rows for which the expression evaluates to true
will be included in the result. Multiple conditions can be combined using logical operators (&&
, ||
). It will automatically detect whether the conditions belong in WHERE vs HAVING.
Temporarily, it is best to use begin and end when filtering multiple conditions. (ex 2 below)\n
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @filter(percent > .5)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AF aa 1 0.6\n 2 \u2502 AG bb 2 0.7\n 3 \u2502 AH aa 3 0.8\n 4 \u2502 AI bb 4 0.9\n 5 \u2502 AJ aa 5 1.0\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(mean = mean(percent))\n @filter begin \n groups == \"bb\" || # logical operators can still be used like this\n mean > .5\n end\n @arrange(groups)\n @collect\n end\n2\u00d72 DataFrame\n Row \u2502 groups mean \n \u2502 String? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 0.6\n 2 \u2502 bb 0.5\n
source
# TidierDB.@full_join
\u2014 Macro.
@full_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform a full join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable, for the metadata storage, for the names to be different.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @full_join(df_join, id2, id)\n @collect\n end\n12\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 AB aa 2 0.2 missing missing missing \n 7 \u2502 AD aa 4 0.4 missing missing missing \n 8 \u2502 AF aa 1 0.6 missing missing missing \n 9 \u2502 AH aa 3 0.8 missing missing missing \n 10 \u2502 AJ aa 5 1.0 missing missing missing \n 11 \u2502 missing missing missing missing AK Y 68\n 12 \u2502 missing missing missing missing AM X 74\n
source
# TidierDB.@group_by
\u2014 Macro.
@group_by(sql_query, columns...)\n
Group SQL table rows by specified column(s). If grouping is performed as a terminal operation without a subsequent mutation or summarization (as in the example below), then the resulting data frame will be ungrouped when @collect
is applied.
Arguments
sql_query
: The SQL query to operate on.columns
: Expressions specifying the columns to group by. Columns can be specified by name.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @arrange(groups)\n @collect\n end\n2\u00d71 DataFrame\n Row \u2502 groups \n \u2502 String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa\n 2 \u2502 bb\n
source
# TidierDB.@inner_join
\u2014 Macro.
@inner_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform an inner join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable, for the metadata storage, for the names to be different.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @inner_join(df_join, id2, id)\n @collect\n end\n5\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n
source
# TidierDB.@interpolate
\u2014 Macro.
@interpolate(args...)\n
Interpolate parameters into expressions for database queries.
Arguments
args...
: A variable number of tuples. Each tuple should contain:
name
: The name of the parameter to interpolate.value
: (Any): The value/vector to interpolate for the corresponding parameter name.Example
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> col_names = [:id, :value, :percent];\n\njulia> cond1 = .2;\n\njulia> cond2 = 5;\n\njulia> @interpolate((condition1, cond1), (columns, col_names), (condition2, cond2));\n\njulia> @chain db_table(db, \"df_mem\") begin \n @select(!!columns)\n @filter begin \n percent < !!condition1\n value < !!condition2\n end\n @collect\n end\n1\u00d73 DataFrame\n Row \u2502 id value percent \n \u2502 String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA 1 0.1\n
source
# TidierDB.@left_join
\u2014 Macro.
@left_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform a left join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable, for the metadata storage, for the names to be different.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @left_join(df_join, id2, id)\n @collect\n end\n10\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 AB aa 2 0.2 missing missing missing \n 7 \u2502 AD aa 4 0.4 missing missing missing \n 8 \u2502 AF aa 1 0.6 missing missing missing \n 9 \u2502 AH aa 3 0.8 missing missing missing \n 10 \u2502 AJ aa 5 1.0 missing missing missing \n
source
# TidierDB.@mutate
\u2014 Macro.
@mutate(sql_query, exprs...)\n
Mutate SQL table rows by adding new columns or modifying existing ones.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions for mutating the table. New columns can be added or existing columns modified using column_name = expression syntax, where expression can involve existing columns.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @mutate(value = value * 4, new_col = percent^2)\n @collect\n end\n10\u00d75 DataFrame\n Row \u2502 id groups value percent new_col \n \u2502 String? String? Int64? Float64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 4 0.1 0.01\n 2 \u2502 AB aa 8 0.2 0.04\n 3 \u2502 AC bb 12 0.3 0.09\n 4 \u2502 AD aa 16 0.4 0.16\n 5 \u2502 AE bb 20 0.5 0.25\n 6 \u2502 AF aa 4 0.6 0.36\n 7 \u2502 AG bb 8 0.7 0.49\n 8 \u2502 AH aa 12 0.8 0.64\n 9 \u2502 AI bb 16 0.9 0.81\n 10 \u2502 AJ aa 20 1.0 1.0\n
source
# TidierDB.@rename
\u2014 Macro.
@rename(sql_query, renamings...)\n
Rename one or more columns in a SQL query.
Arguments
-sql_query
: The SQL query to operate on. -renamings
: One or more pairs of old and new column names, specified as new name = old name
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @rename(new_name = percent)\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value new_name \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AB aa 2 0.2\n 3 \u2502 AC bb 3 0.3\n 4 \u2502 AD aa 4 0.4\n 5 \u2502 AE bb 5 0.5\n 6 \u2502 AF aa 1 0.6\n 7 \u2502 AG bb 2 0.7\n 8 \u2502 AH aa 3 0.8\n 9 \u2502 AI bb 4 0.9\n 10 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@right_join
\u2014 Macro.
@right_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform a right join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable, for the metadata storage, for the names to be different.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @right_join(df_join, id2, id)\n @collect\n end\n7\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 missing missing missing missing AK Y 68\n 7 \u2502 missing missing missing missing AM X 74\n
source
# TidierDB.@select
\u2014 Macro.
@select(sql_query, columns)\n
Select specified columns from a SQL table.
Arguments
sql_query
: The SQL query to select columns from.columns
: Expressions specifying the columns to select. Columns can be specified by name, and new columns can be created with expressions using existing column values.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @select(groups:percent)\n @collect\n end\n10\u00d73 DataFrame\n Row \u2502 groups value percent \n \u2502 String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 bb 1 0.1\n 2 \u2502 aa 2 0.2\n 3 \u2502 bb 3 0.3\n 4 \u2502 aa 4 0.4\n 5 \u2502 bb 5 0.5\n 6 \u2502 aa 1 0.6\n 7 \u2502 bb 2 0.7\n 8 \u2502 aa 3 0.8\n 9 \u2502 bb 4 0.9\n 10 \u2502 aa 5 1.0\n\njulia> @chain db_table(db, :df_mem) begin\n @select(contains(\"e\"))\n @collect\n end\n10\u00d72 DataFrame\n Row \u2502 value percent \n \u2502 Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 0.1\n 2 \u2502 2 0.2\n 3 \u2502 3 0.3\n 4 \u2502 4 0.4\n 5 \u2502 5 0.5\n 6 \u2502 1 0.6\n 7 \u2502 2 0.7\n 8 \u2502 3 0.8\n 9 \u2502 4 0.9\n 10 \u2502 5 1.0\n
source
# TidierDB.@semi_join
\u2014 Macro.
@semi_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform a semi join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable, for the metadata storage, for the names to be different.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @semi_join(df_join, id2, id)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AC bb 3 0.3\n 3 \u2502 AE bb 5 0.5\n 4 \u2502 AG bb 2 0.7\n 5 \u2502 AI bb 4 0.9\n
source
# TidierDB.@slice_max
\u2014 Macro.
@slice_max(sql_query, column, n = 1)\n
Select rows with the largest values in specified column. This will always return ties.
Arguments
sql_query
: The SQL query to operate on.column
: Column to identify the largest values.n
: The number of rows to select with the largest values for each specified column. Default is 1, which selects the row with the largest value.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_max(value, n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_max(value)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 id groups value percent rank_col \n \u2502 String? String? Int64? Float64? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AE bb 5 0.5 1\n 2 \u2502 AJ aa 5 1.0 1\n
source
# TidierDB.@slice_min
\u2014 Macro.
@slice_min(sql_query, column, n = 1)\n
Select rows with the smallest values in specified column. This will always return ties.
Arguments
sql_query
: The SQL query to operate on.column
: Column to identify the smallest values.n
: The number of rows to select with the smallest values for each specified column. Default is 1, which selects the row with the smallest value.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_min(value, n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_min(value)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 id groups value percent rank_col \n \u2502 String? String? Int64? Float64? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 1\n 2 \u2502 AF aa 1 0.6 1\n
source
# TidierDB.@slice_sample
\u2014 Macro.
@slice_sample(sql_query, n)\n
Randomly select a specified number of rows from a SQL table.
Arguments
sql_query
: The SQL query to operate on.n
: The number of rows to randomly select.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_sample(n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_sample()\n @collect\n end;\n
source
# TidierDB.@summarise
\u2014 Macro.
@summarise(sql_query, exprs...)\n
Aggregate and summarize specified columns of a SQL table.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions defining the aggregation and summarization operations. These can specify simple aggregations like mean, sum, and count, or more complex expressions involving existing column values.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(across((value:percent), (mean, sum)))\n @arrange(groups)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 groups mean_value mean_percent sum_value sum_percent \n \u2502 String? Float64? Float64? Int128? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 0.6 15 3.0\n 2 \u2502 bb 3.0 0.5 15 2.5\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(test = sum(percent), n = n())\n @arrange(groups)\n @collect\n end\n2\u00d73 DataFrame\n Row \u2502 groups test n \n \u2502 String? Float64? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 5\n 2 \u2502 bb 2.5 5\n
source
# TidierDB.@summarize
\u2014 Macro.
@summarize(sql_query, exprs...)\n
Aggregate and summarize specified columns of a SQL table.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions defining the aggregation and summarization operations. These can specify simple aggregations like mean, sum, and count, or more complex expressions involving existing column values.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(across((ends_with(\"e\"), starts_with(\"p\")), (mean, sum)))\n @arrange(groups)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 groups mean_value mean_percent sum_value sum_percent \n \u2502 String? Float64? Float64? Int128? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 0.6 15 3.0\n 2 \u2502 bb 3.0 0.5 15 2.5\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(test = sum(percent), n = n())\n @arrange(groups)\n @collect\n end\n2\u00d73 DataFrame\n Row \u2502 groups test n \n \u2502 String? Float64? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 5\n 2 \u2502 bb 2.5 5\n
source
# TidierDB.@window_frame
\u2014 Macro.
@window_frame(sql_query, frame_start::Int, frame_end::Int)\n
Define the window frame for window functions in a SQL query, specifying the range of rows to include in the calculation relative to the current row.
Arguments
sql_query: The SQL query to operate on, expected to be an instance of SQLQuery.
frame_start
: The starting point of the window frame. A positive value indicates the start after the current row (FOLLOWING), a negative value indicates before the current row (PRECEDING), and 0 indicates the current row.frame_end
: The ending point of the window frame. A positive value indicates the end after the current row (FOLLOWING), a negative value indicates before the current row (PRECEDING), and 0 indicates the current row.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n
source
# TidierDB.@window_order
\u2014 Macro.
@window_order(sql_query, columns...)\n
Specify the order of rows for window functions within a SQL query.
Arguments
sql_query
: The SQL query to operate on.columns
: Columns to order the rows by for the window function. Can include multiple columns for nested sorting. Prepend a column name with - for descending order.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n
source
"},{"location":"reference/#reference-internal-functions","title":"Reference - Internal functions","text":""},{"location":"examples/generated/UserGuide/athena/","title":"Using Athena","text":"To use the Athena AWS backend with TidierDB, set up and a small syntax difference are covered here.
"},{"location":"examples/generated/UserGuide/athena/#connecting","title":"Connecting","text":"Connection is established through AWS.jl as shwon below.
using TidierDB, AWS\nset_sql_mode(:athena)\n# Replace your credentials as needed below\naws_access_key_id = get(ENV,\"AWS_ACCESS_KEY_ID\",\"key\")\naws_secret_access_key = get(ENV, \"AWS_SECRET_ACCESS_KEY\",\"secret_key\")\naws_region = get(ENV,\"AWS_DEFAULT_REGION\",\"region\")\n\nconst AWS_GLOBAL_CONFIG = Ref{AWS.AWSConfig}()\ncreds = AWSCredentials(aws_access_key_id, aws_secret_access_key)\n\nAWS_GLOBAL_CONFIG[] = AWS.global_aws_config(region=aws_region, creds=creds)\n\ncatalog = \"AwsDataCatalog\"\nworkgroup = \"primary\"\ndb = \"demodb\"\nall_results = true\nresults_per_increment = 10\nout_loc = \"s3://location/\"\n\nathena_params = Dict(\n \"ResultConfiguration\" => Dict(\n \"OutputLocation\" => out_loc\n ),\n \"QueryExecutionContext\" => Dict(\n \"Database\" => db,\n \"Catalog\" => catalog\n ),\n \"Workgroup\" => workgroup\n)\n
"},{"location":"examples/generated/UserGuide/athena/#db_table-differences","title":"db_table
differences","text":"There are two differences for db_table
which are seen in the query below
\"demodb.table_name
db_table
requires a third argument: the athena_params from above.from_query
with Athena to reduce number of queries","text":"Throughout TidierDB, each time db_table
is called, it queries the databases to get the metadata. Consider how AWS Athena logs queries, a user may want to reduce the number of queries. This can be done saving the results of db_table
, and then using from_query with those results for further queries as shown below.
mtcars = db_table(AWS_GLOBAL_CONFIG[], \"demodb.mtcars\", athena_params)\n@chain from_query(mtcars) begin\n @filter(cyl > 4)\n @group_by(cyl)\n @summarize(mpg = mean(mpg))\n #@show_query\n @collect\nend\n
2\u00d72 DataFrame\n Row \u2502 cyl mpg\n \u2502 Int64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 6 19.7429\n 2 \u2502 8 15.1\n
I would like to acknowledge the work of Manu Francis and this blog post, which helped guide this process
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/from_queryex/","title":"Reusing Part of a Query","text":"While using TidierDB, you may need to generate part of a query and reuse it multiple times. from_query()
enables a query portion to be reused multiple times as shown below.
import TidierDB as DB\ncon = DB.connect(:duckdb)\nDB.copy_to(con, \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\", \"mtcars2\")\n
Start a query to analyze fuel efficiency by number of cylinders. However, to further build on this query later, end the chain without using @show_query
or @collect
query = DB.@chain DB.db_table(con, :mtcars2) begin\n DB.@group_by cyl\n DB.@summarize begin\n across(mpg, (mean, minimum, maximum))\n num_cars = n()\n end\n DB.@mutate begin\n efficiency = case_when(\n mean_mpg >= 25, \"High\",\n mean_mpg >= 15, \"Moderate\",\n \"Low\" )\n end\nend;\n
Now, from_query
will allow you to reuse the query to calculate the average horsepower for each efficiency category
DB.@chain DB.from_query(query) begin\n DB.@left_join(mtcars2, cyl, cyl)\n DB.@group_by(efficiency)\n DB.@summarize(avg_hp = mean(hp))\n DB.@collect\nend\n
2\u00d72 DataFrame\n Row \u2502 efficiency avg_hp\n \u2502 String? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Moderate 180.238\n 2 \u2502 High 82.6364\n
Reuse the query again to find the car with the highest MPG for each cylinder category
DB.@chain DB.from_query(query) begin\n DB.@left_join(mtcars2, cyl, cyl)\n DB.@group_by cyl\n DB.@slice_max(mpg)\n DB.@select model cyl mpg\n DB.@collect\nend\n
3\u00d73 DataFrame\n Row \u2502 model cyl mpg\n \u2502 String? Int64? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Pontiac Firebird 8 19.2\n 2 \u2502 Toyota Corolla 4 33.9\n 3 \u2502 Hornet 4 Drive 6 21.4\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/functions_pass_to_DB/","title":"Writing Functions/Macros with TidierDB Chains","text":"How can functions pass arguments to a TidierDB chain?
In short, you have to use a macro instead, in conjunction with @interpolate
To write a macro that will take arguments and pass them to a TidierDB chain, there are 3 steps:
!!
@interpolate
to make these arguments accessible to the chain. @interpolate
takes tuples as arguments (one for the !!
name, and one for the actual content you want the chain to use)@interpolate
and then the chain macro sequentiallyusing TidierDB\npath = \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\"\ncopy_to(db, path, \"mtcars\");\n\n# STEP 1\nmacro f1(conditions, columns) # The arguemnt names will be names of the `!!` values\n return quote\n # add chain here\n @chain db_table(db, :mtcars) begin\n @filter(!!conditions > 3)\n @select(!!columns)\n @aside @show_query _\n @collect\n end # ends the chain\n end # ends the quote.\nend # ends the macro\n
# STEP 2\nvariable = :gear;\ncols = [:model, :mpg, :gear, :wt];\n@interpolate((conditions, variable), (columns, cols));\n@f1(variable, cols)\n
17\u00d74 DataFrame\n Row \u2502 model mpg gear wt\n \u2502 String? Float64? Int32? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 21.0 4 2.62\n 2 \u2502 Mazda RX4 Wag 21.0 4 2.875\n 3 \u2502 Datsun 710 22.8 4 2.32\n \u22ee \u2502 \u22ee \u22ee \u22ee \u22ee\n 15 \u2502 Ferrari Dino 19.7 5 2.77\n 16 \u2502 Maserati Bora 15.0 5 3.57\n 17 \u2502 Volvo 142E 21.4 4 2.78\n 11 rows omitted\n
Let's say you wanted to filter on a new variable with a different name and select new columns,
new_condition = :wt;\nnew_cols = [:model, :drat]\n@interpolate((conditions, new_condition), (columns, new_cols));\n@f1(new_condition, new_cols)\n
20\u00d72 DataFrame\n Row \u2502 model drat\n \u2502 String? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Hornet 4 Drive 3.08\n 2 \u2502 Hornet Sportabout 3.15\n 3 \u2502 Valiant 2.76\n \u22ee \u2502 \u22ee \u22ee\n 18 \u2502 Pontiac Firebird 3.08\n 19 \u2502 Ford Pantera L 4.22\n 20 \u2502 Maserati Bora 3.54\n 14 rows omitted\n
You can also interpolate vectors of strings into a @filter(col in (values))
as well by using the following syntax @filter(col in [!!values])
In short, the first argument in @interpolate
must be the name of the macro argument it refers to, and the second argument is what you would like to replace it.
We recognize this adds friction and that it is not ideal, but given the TidierDB macro expressions/string interplay, this is currently the most graceful and functional option available and hopefully a temporary solution to better interpolation that mirrors TidierData.jl.
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/getting_started/","title":"Getting Started","text":"To use TidierDB.jl, you will have to set up a connection. TidierDB.jl gives you access to duckdb via duckdb_open
and duckdb_connect
. However, to use MySql, ClickHouse, MSSQL, Postgres, or SQLite, you will have to load those packages in first.
If you plan to use TidierDB.jl with TidierData.jl or Tidier.jl, it is most convenient to load the packages as follows:
using TidierData\nimport TidierDB as DB\n
Alternatively, using Tidier
will import TidierDB in the above manner for you, where TidierDB functions and macros will be available as DB.@mutate()
and so on, and the TidierData equivalent would be @mutate()
.
There are two ways to connect to the database. You can use connect
without any need to load any additional packages.
For example Connecting to MySQL
conn = connect(:mysql; host=\"localhost\", user=\"root\", password=\"password\", db=\"mydb\")\n
versus connecting to DuckDB
conn = connect(:duckdb)\n
connect()
and will require using the respective libraries outlined below to establish a connectionAlternatively, you can use the packages outlined below to establish a connection through their respective methods.
For DuckDB, SQLite, and MySQL, copy_to()
lets you copy data to the database and query there. ClickHouse, MSSQL, and Postgres support for copy_to()
has not been added yet.
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/key_differences/","title":"Key Differences from TidierData.jl","text":"There are a few important syntax and behavior differences between TidierDB.jl and TidierData.jl outlined below.
"},{"location":"examples/generated/UserGuide/key_differences/#creating-a-database","title":"Creating a database","text":"For these examples we will use DuckDB, the default backend, although SQLite, Postgres, MySQL, MariaDB, MSSQL, and ClickHouse are possible. If you have an existing DuckDB connection, then this step is not required. For these examples, we will create a data frame and copy it to an in-memory DuckDB database.
using DataFrames, TidierDB\n\ndf = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9],\n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10],\n value = repeat(1:5, 2),\n percent = 0.1:0.1:1.0);\n\ndb = connect(:duckdb);\n\ncopy_to(db, df, \"df_mem\"); # copying over the data frame to an in-memory database\n
"},{"location":"examples/generated/UserGuide/key_differences/#row-ordering","title":"Row ordering","text":"DuckDB benefits from aggressive parallelization of pipelines. This means that if you have multiple threads enabled in Julia, which you can check or set using Threads.nthreads()
, DuckDB will use multiple threads. However, because many operations are multi-threaded, the resulting row order is inconsistent. If row order needs to be deterministic for your use case, make sure to apply an @arrange(column_name_1, column_name_2, etc...)
prior to collecting the results.
When using TidierDB, db_table(connection, :table_name)
is used to start a chain.
In TidierDB, when performing @group_by
then @mutate
, the table will be ungrouped after applying all of the mutations in the clause to the grouped data. To perform subsequent grouped operations, the user would have to regroup the data. This is demonstrated below.
@chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarize(mean_percent = mean(percent))\n @collect\n end\n
2\u00d72 DataFrame Rowgroupsmean_percentString?Float64?1bb0.52aa0.6 Regrouping following @mutate
@chain db_table(db, :df_mem) begin\n @group_by(groups)\n @mutate(max = maximum(percent), min = minimum(percent))\n @group_by(groups)\n @summarise(mean_percent = mean(percent))\n @collect\nend\n
2\u00d72 DataFrame Rowgroupsmean_percentString?Float64?1bb0.52aa0.6 "},{"location":"examples/generated/UserGuide/key_differences/#joining","title":"Joining","text":"There is one key difference for joining:
The column on both the new and old table must be specified. They do not need to be the same, and given SQL behavior where both columns are kept when joining two tables, it is preferable if they have different names. This avoids \"ambiguous reference\" errors that would otherwise come up and complicate the use of tidy selection for columns. Athena has an additional slight difference given the need for parameters, which is covered in the Athena documentation page.
df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\ncopy_to(db, df2, \"df_join\");\n\n@chain db_table(db, :df_mem) begin\n @left_join(df_join, id2, id)\n @collect\nend\n
10\u00d77 DataFrame Rowidgroupsvaluepercentid2categoryscoreString?String?Int64?Float64?String?String?Int64?1AAbb10.1AAX882ACbb30.3ACY923AEbb50.5AEX774AGbb20.7AGY835AIbb40.9AIX956ABaa20.2missingmissingmissing7ADaa40.4missingmissingmissing8AFaa10.6missingmissingmissing9AHaa30.8missingmissingmissing10AJaa51.0missingmissingmissing "},{"location":"examples/generated/UserGuide/key_differences/#differences-in-case_when","title":"Differences in case_when()
","text":"In TidierDB, after the clause is completed, the result for the new column should is separated by a comma ,
in contrast to TidierData.jl, where the result for the new column is separated by a =>
.
@chain db_table(db, :df_mem) begin\n @mutate(new_col = case_when(percent > .5, \"Pass\", # in TidierData, percent > .5 => \"Pass\",\n percent <= .5, \"Try Again\", # percent <= .5 => \"Try Again\"\n true, \"middle\"))\n @collect\n end\n
10\u00d75 DataFrame Rowidgroupsvaluepercentnew_colString?String?Int64?Float64?String?1AAbb10.1Try Again2ABaa20.2Try Again3ACbb30.3Try Again4ADaa40.4Try Again5AEbb50.5Try Again6AFaa10.6Pass7AGbb20.7Pass8AHaa30.8Pass9AIbb40.9Pass10AJaa51.0Pass "},{"location":"examples/generated/UserGuide/key_differences/#interpolation","title":"Interpolation","text":"To use !! Interpolation, instead of being able to define the alternate names/value in the global context, the user has to use @interpolate
. This will hopefully be fixed in future versions. Otherwise, the behavior is generally the same, although this creates friction around calling functions.
Also, when using interpolation with exponents, the interpolated value must go inside parentheses.
@interpolate((test, :percent)); # this still supports strings, vectors of names, and values\n\n@chain db_table(db, :df_mem) begin\n @mutate(new_col = case_when((!!test)^2 > .5, \"Pass\",\n (!!test)^2 < .5, \"Try Again\",\n \"middle\"))\n @collect\nend\n
10\u00d75 DataFrame\n Row \u2502 id groups value percent new_col\n \u2502 String? String? Int64? Float64? String?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 Try Again\n 2 \u2502 AB aa 2 0.2 Try Again\n 3 \u2502 AC bb 3 0.3 Try Again\n \u22ee \u2502 \u22ee \u22ee \u22ee \u22ee \u22ee\n 8 \u2502 AH aa 3 0.8 Pass\n 9 \u2502 AI bb 4 0.9 Pass\n 10 \u2502 AJ aa 5 1.0 Pass\n 4 rows omitted\n
"},{"location":"examples/generated/UserGuide/key_differences/#slicing-ties","title":"Slicing ties","text":"@slice_min()
and @slice_max()
will always return ties due to SQL behavior.
This page was generated using Literate.jl.
"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":""},{"location":"#what-is-tidierdbjl","title":"What is TidierDB.jl?","text":"TidierDB.jl is a 100% Julia implementation of the dbplyr R package, and similar to Python's ibis package.
The main goal of TidierDB.jl is to bring the syntax of Tidier.jl to multiple SQL backends, making it possible to analyze data directly on databases without needing to copy the entire database into memory.
"},{"location":"#currently-supported-backends-include","title":"Currently supported backends include:","text":"set_sql_mode(:duckdb)
set_sql_mode(:clickhouse)
set_sql_mode(:lite)
set_sql_mode(:mysql)
set_sql_mode(:mssql)
set_sql_mode(:postgres)
set_sql_mode(:athena)
set_sql_mode(:gbq)
set_sql_mode(:oracle)
The style of SQL that is generated can be modified using set_sql_mode()
.
For the stable version:
] add TidierDB\n
TidierDB.jl currently supports the following top-level macros:
@arrange
@group_by
@filter
@select
@mutate
, which supports across()
@summarize
and @summarise
, which supports across()
@distinct
@left_join
, @right_join
, @inner_join
, @anti_join
, @full_join
, and @semi_join
(slight syntax differences from TidierData.jl)@count
@slice_min
, @slice_max
, @slice_sample
@window_order
and @window_frame
@show_query
@collect
Supported helper functions for most backends include:
across()
desc()
if_else()
and case_when()
n()
starts_with()
, ends_with()
, and contains()
as_float()
, as_integer()
, and as_string()
is_missing()
missing_if()
and replace_missing()
From TidierStrings.jl:
str_detect
, str_replace
, str_replace_all
, str_remove_all
, str_remove
From TidierDates.jl:
year
, month
, day
, hour
, min
, second
, floor_date
, difftime
Supported aggregate functions (as supported by the backend) with more to come
mean
, minimum
, maximum
, std
, sum
, cumsum
, cor
, cov
, var
@summarize
supports any SQL aggregate function in addition to the list above. Simply write the function as written in SQL syntax and it will workcopy_to
(for DuckDB, MySQL, SQLite)DuckDB specifically enables copy_to to directly read in .parquet
, .json
, .csv
, and .arrow
files, including https file paths.
path = \"file_path.parquet\"\ncopy_to(conn, path, \"table_name\")\n
"},{"location":"#what-is-the-recommended-way-to-use-tidierdb","title":"What is the recommended way to use TidierDB?","text":"Typically, you will want to use TidierDB alongside TidierData because there is certain functionality (such as pivoting) which is only supported in TidierData and can only be performed on data frames.
Our recommended path for using TidierDB is to import the package so that there are no namespace conflicts with TidierData. Once TidierDB is integrated with Tidier, then Tidier will automatically load the packages in this fashion.
First, let's develop and execute a query using TidierDB. Notice that all top-level macros and functions originating from TidierDB start with a DB
prefix. Any functions defined within macros do not need to be prefixed with DB
because they are pseudofunctions that are converted into SQL code.
Even though the code reads similarly to TidierData, note that no computational work actually occurs until you run DB.@collect()
, which runs the SQL query and instantiates the result as a DataFrame.
using TidierData\nimport TidierDB as DB\n\ndb = DB.connect(:duckdb);\npath = \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\"\nDB.copy_to(db, path, \"mtcars\");\n\n@chain DB.db_table(db, :mtcars) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@collect\nend\n
2\u00d75 DataFrame\n Row \u2502 cyl mpg mpg_squared mpg_rounded mpg_efficiency \n \u2502 Int64? Float64? Float64? Float64? String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 27.3444 747.719 27.0 efficient\n 2 \u2502 6 19.7333 389.404 20.0 moderate\n
"},{"location":"#what-if-we-wanted-to-pivot-the-result","title":"What if we wanted to pivot the result?","text":"We cannot do this using TidierDB. However, we can call @pivot_longer()
from TidierData after the result of the query has been instantiated as a DataFrame, like this:
@chain DB.db_table(db, :mtcars) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@collect\n @pivot_longer(everything(), names_to = \"variable\", values_to = \"value\")\nend\n
10\u00d72 DataFrame\n Row \u2502 variable value \n \u2502 String Any \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 cyl 4\n 2 \u2502 cyl 6\n 3 \u2502 mpg 27.3444\n 4 \u2502 mpg 19.7333\n 5 \u2502 mpg_squared 747.719\n 6 \u2502 mpg_squared 389.404\n 7 \u2502 mpg_rounded 27.0\n 8 \u2502 mpg_rounded 20.0\n 9 \u2502 mpg_efficiency efficient\n 10 \u2502 mpg_efficiency moderate\n
"},{"location":"#what-sql-query-does-tidierdb-generate-for-a-given-piece-of-julia-code","title":"What SQL query does TidierDB generate for a given piece of Julia code?","text":"We can replace DB.@collect()
with DB.@show_query
to reveal the underlying SQL query being generated by TidierDB. To handle complex queries, TidierDB makes heavy use of Common Table Expressions (CTE), which are a useful tool to organize long queries.
@chain DB.db_table(db, :mtcars) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@show_query\nend\n
WITH cte_1 AS (\nSELECT *\n FROM mtcars\n WHERE NOT (starts_with(model, 'M'))),\ncte_2 AS (\nSELECT cyl, AVG(mpg) AS mpg\n FROM cte_1\n GROUP BY cyl),\ncte_3 AS (\nSELECT cyl, mpg, POWER(mpg, 2) AS mpg_squared, ROUND(mpg) AS mpg_rounded, CASE WHEN mpg >= POWER(cyl, 2) THEN 'efficient' WHEN mpg < 15.2 THEN 'inefficient' ELSE 'moderate' END AS mpg_efficiency\n FROM cte_2 ),\ncte_4 AS (\nSELECT *\n FROM cte_3\n WHERE mpg_efficiency in ('moderate', 'efficient')) \nSELECT *\n FROM cte_4 \n ORDER BY mpg_rounded DESC\n
"},{"location":"#tidierdb-is-already-quite-fully-featured-supporting-advanced-tidierdata-functions-like-across-for-multi-column-selection","title":"TidierDB is already quite fully-featured, supporting advanced TidierData functions like across()
for multi-column selection.","text":"@chain DB.db_table(db, :mtcars) begin\n DB.@group_by(cyl)\n DB.@summarize(across((starts_with(\"a\"), ends_with(\"s\")), (mean, sum)))\n DB.@collect\nend\n
3\u00d75 DataFrame\n Row \u2502 cyl mean_am mean_vs sum_am sum_vs \n \u2502 Int64? Float64? Float64? Int128? Int128? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 0.727273 0.909091 8 10\n 2 \u2502 6 0.428571 0.571429 3 4\n 3 \u2502 8 0.142857 0.0 2 0\n
Bang bang !!
interpolation for columns and values is also supported.
There are a few subtle but important differences from Tidier.jl outlined here.
"},{"location":"#missing-a-function-or-backend","title":"Missing a function or backend?","text":"You can use any existing SQL function within @mutate
with the correct SQL syntax and it should just work.
But if you run into problems please open an issue, and we will be happy to take a look!
"},{"location":"reference/","title":"Reference","text":""},{"location":"reference/#index","title":"Index","text":"TidierDB.connect
TidierDB.copy_to
TidierDB.@anti_join
TidierDB.@arrange
TidierDB.@count
TidierDB.@distinct
TidierDB.@filter
TidierDB.@full_join
TidierDB.@group_by
TidierDB.@inner_join
TidierDB.@interpolate
TidierDB.@left_join
TidierDB.@mutate
TidierDB.@rename
TidierDB.@right_join
TidierDB.@select
TidierDB.@semi_join
TidierDB.@slice_max
TidierDB.@slice_min
TidierDB.@slice_sample
TidierDB.@summarise
TidierDB.@summarize
TidierDB.@window_frame
TidierDB.@window_order
# TidierDB.connect
\u2014 Method.
connect(backend::Symbol; kwargs...)\n
This function establishes a database connection based on the specified backend and connection parameters and sets the SQL mode
Arguments
backend
: A symbol specifying the database backend to connect to. Supported backends are:
:duckdb
, :lite
(SQLite), :mssql
, :mysql
(for MariaDB and MySQL), :clickhouse
, :postgres
kwargs
: Keyword arguments specifying the connection parameters for the selected backend. The required parameters vary depending on the backend:
MySQL:
host
: The host name or IP address of the MySQL server. Default is \"localhost\".user
: The username for authentication. Default is an empty string.password
: The password for authentication.db
: The name of the database to connect to (optional).port
: The port number of the MySQL server (optional).Returns
Examples
**Connect to MySQL**\n\n**conn = connect(:mysql; host=\"localhost\", user=\"root\", password=\"password\", db=\"mydb\")**\n\n**Connect to PostgreSQL using LibPQ**\n\n**conn = connect(:postgres; host=\"localhost\", dbname=\"mydb\", user=\"postgres\", password=\"password\")**\n\n**Connect to ClickHouse**\n\n**conn = connect(:clickhouse; host=\"localhost\", port=9000, database=\"mydb\", user=\"default\", password=\"\")**\n\n**Connect to SQLite**\n\n**conn = connect(:lite)**\n\n**Connect to Google Big Query**\n\n**conn = connect(:gbq, \"json_user_key_path\", \"project_id\")**\n\n**Connect to DuckDB**\n\njulia> db = connect(:duckdb) DuckDB.Connection(\":memory:\")\n\n\n<a target='_blank' href='https://github.com/TidierOrg/TidierDB.jl/blob/d5c110ba18b307d85160bdec171eb326e2f9cf4d/src/TidierDB.jl#L324-L358' class='documenter-source'>source</a><br>\n\n<a id='TidierDB.copy_to-Tuple{Any, Union{AbstractString, DataFrame}, String}' href='#TidierDB.copy_to-Tuple{Any, Union{AbstractString, DataFrame}, String}'>#</a>\n**`TidierDB.copy_to`** — *Method*.\n\n\n\n```julia\n copy_to(conn, df_or_path, \"name\")\n
Allows the user to copy a df to the database connection. Currently supports DuckDB, SQLite, and MySQL.
Arguments
-conn
: the database connection -df
: dataframe to be copied or path to serve as source. With DuckDB, path supports .csv, .json, .parquet to be used without copying intermediary df. -name
: name as string for the database to be used
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"test\");\n
source
# TidierDB.@anti_join
\u2014 Macro.
@anti_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform an anti join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable for the metadata storage that the names be different.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @anti_join(df_join, id2, id)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AB aa 2 0.2\n 2 \u2502 AD aa 4 0.4\n 3 \u2502 AF aa 1 0.6\n 4 \u2502 AH aa 3 0.8\n 5 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@arrange
\u2014 Macro.
@arrange(sql_query, columns...)\n
Order SQL table rows based on specified column(s).
Arguments
sql_query
: The SQL query to operate on.columns
: Columns to order the rows by. Can include multiple columns for nested sorting. Wrap column name with desc()
for descending order.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @arrange(value, desc(percent))\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AF aa 1 0.6\n 2 \u2502 AA bb 1 0.1\n 3 \u2502 AG bb 2 0.7\n 4 \u2502 AB aa 2 0.2\n 5 \u2502 AH aa 3 0.8\n 6 \u2502 AC bb 3 0.3\n 7 \u2502 AI bb 4 0.9\n 8 \u2502 AD aa 4 0.4\n 9 \u2502 AJ aa 5 1.0\n 10 \u2502 AE bb 5 0.5\n
source
# TidierDB.@count
\u2014 Macro.
@count(sql_query, columns...)\n
Count the number of rows grouped by specified column(s).
Arguments
sql_query
: The SQL query to operate on.columns
: Columns to group by before counting. If no columns are specified, counts all rows in the query.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @count(groups)\n @arrange(groups)\n @collect\n end\n2\u00d72 DataFrame\n Row \u2502 groups count \n \u2502 String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 5\n 2 \u2502 bb 5\n
source
# TidierDB.@distinct
\u2014 Macro.
@distinct(sql_query, columns...)\n
Select distinct rows based on specified column(s). Distinct works differently in TidierData vs SQL and therefore TidierDB. Distinct will also select only the columns it is given (or all if given none).
Arguments
sql_query
: The SQL query to operate on. columns
: Columns to determine uniqueness. If no columns are specified, all columns are used to identify distinct rows.
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @distinct(value)\n @arrange(value)\n @collect\n end\n5\u00d71 DataFrame\n Row \u2502 value \n \u2502 Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1\n 2 \u2502 2\n 3 \u2502 3\n 4 \u2502 4\n 5 \u2502 5\n\njulia> @chain db_table(db, :df_mem) begin\n @distinct\n @arrange(id)\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AB aa 2 0.2\n 3 \u2502 AC bb 3 0.3\n 4 \u2502 AD aa 4 0.4\n 5 \u2502 AE bb 5 0.5\n 6 \u2502 AF aa 1 0.6\n 7 \u2502 AG bb 2 0.7\n 8 \u2502 AH aa 3 0.8\n 9 \u2502 AI bb 4 0.9\n 10 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@filter
\u2014 Macro.
@filter(sql_query, conditions...)\n
Filter rows in a SQL table based on specified conditions.
Arguments
sql_query
: The SQL query to filter rows from.conditions
: Expressions specifying the conditions that rows must satisfy to be included in the output. Rows for which the expression evaluates to true
will be included in the result. Multiple conditions can be combined using logical operators (&&
, ||
). It will automatically detect whether the conditions belong in WHERE vs HAVING.
Temporarily, it is best to use begin and end when filtering multiple conditions. (ex 2 below)\n
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @filter(percent > .5)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AF aa 1 0.6\n 2 \u2502 AG bb 2 0.7\n 3 \u2502 AH aa 3 0.8\n 4 \u2502 AI bb 4 0.9\n 5 \u2502 AJ aa 5 1.0\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(mean = mean(percent))\n @filter begin \n groups == \"bb\" || # logical operators can still be used like this\n mean > .5\n end\n @arrange(groups)\n @collect\n end\n2\u00d72 DataFrame\n Row \u2502 groups mean \n \u2502 String? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 0.6\n 2 \u2502 bb 0.5\n
source
# TidierDB.@full_join
\u2014 Macro.
@full_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform a full join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable for the metadata storage that the names be different.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @full_join(df_join, id2, id)\n @collect\n end\n12\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 AB aa 2 0.2 missing missing missing \n 7 \u2502 AD aa 4 0.4 missing missing missing \n 8 \u2502 AF aa 1 0.6 missing missing missing \n 9 \u2502 AH aa 3 0.8 missing missing missing \n 10 \u2502 AJ aa 5 1.0 missing missing missing \n 11 \u2502 missing missing missing missing AK Y 68\n 12 \u2502 missing missing missing missing AM X 74\n
source
# TidierDB.@group_by
\u2014 Macro.
@group_by(sql_query, columns...)\n
Group SQL table rows by specified column(s). If grouping is performed as a terminal operation without a subsequent mutation or summarization (as in the example below), then the resulting data frame will be ungrouped when @collect
is applied.
Arguments
sql_query
: The SQL query to operate on.columns
: Expressions specifying the columns to group by. Columns can be specified by name.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @arrange(groups)\n @collect\n end\n2\u00d71 DataFrame\n Row \u2502 groups \n \u2502 String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa\n 2 \u2502 bb\n
source
# TidierDB.@inner_join
\u2014 Macro.
@inner_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform an inner join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable for the metadata storage that the names be different.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @inner_join(df_join, id2, id)\n @collect\n end\n5\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n
source
# TidierDB.@interpolate
\u2014 Macro.
@interpolate(args...)\n
Interpolate parameters into expressions for database queries.
Arguments
args...
: A variable number of tuples. Each tuple should contain:
name
: The name of the parameter to interpolate.value
: (Any): The value/vector to interpolate for the corresponding parameter name.Example
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> col_names = [:id, :value, :percent];\n\njulia> cond1 = .2;\n\njulia> cond2 = 5;\n\njulia> @interpolate((condition1, cond1), (columns, col_names), (condition2, cond2));\n\njulia> @chain db_table(db, \"df_mem\") begin \n @select(!!columns)\n @filter begin \n percent < !!condition1\n value < !!condition2\n end\n @collect\n end\n1\u00d73 DataFrame\n Row \u2502 id value percent \n \u2502 String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA 1 0.1\n
source
# TidierDB.@left_join
\u2014 Macro.
@left_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform a left join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable for the metadata storage that the names be different.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @left_join(df_join, id2, id)\n @collect\n end\n10\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 AB aa 2 0.2 missing missing missing \n 7 \u2502 AD aa 4 0.4 missing missing missing \n 8 \u2502 AF aa 1 0.6 missing missing missing \n 9 \u2502 AH aa 3 0.8 missing missing missing \n 10 \u2502 AJ aa 5 1.0 missing missing missing \n
source
# TidierDB.@mutate
\u2014 Macro.
@mutate(sql_query, exprs...)\n
Mutate SQL table rows by adding new columns or modifying existing ones.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions for mutating the table. New columns can be added or existing columns modified using column_name = expression syntax, where expression can involve existing columns.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @mutate(value = value * 4, new_col = percent^2)\n @collect\n end\n10\u00d75 DataFrame\n Row \u2502 id groups value percent new_col \n \u2502 String? String? Int64? Float64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 4 0.1 0.01\n 2 \u2502 AB aa 8 0.2 0.04\n 3 \u2502 AC bb 12 0.3 0.09\n 4 \u2502 AD aa 16 0.4 0.16\n 5 \u2502 AE bb 20 0.5 0.25\n 6 \u2502 AF aa 4 0.6 0.36\n 7 \u2502 AG bb 8 0.7 0.49\n 8 \u2502 AH aa 12 0.8 0.64\n 9 \u2502 AI bb 16 0.9 0.81\n 10 \u2502 AJ aa 20 1.0 1.0\n
source
# TidierDB.@rename
\u2014 Macro.
@rename(sql_query, renamings...)\n
Rename one or more columns in a SQL query.
Arguments
-sql_query
: The SQL query to operate on. -renamings
: One or more pairs of old and new column names, specified as new name = old name
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @rename(new_name = percent)\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value new_name \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AB aa 2 0.2\n 3 \u2502 AC bb 3 0.3\n 4 \u2502 AD aa 4 0.4\n 5 \u2502 AE bb 5 0.5\n 6 \u2502 AF aa 1 0.6\n 7 \u2502 AG bb 2 0.7\n 8 \u2502 AH aa 3 0.8\n 9 \u2502 AI bb 4 0.9\n 10 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@right_join
\u2014 Macro.
@right_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform a right join between two SQL queries based on a specified condition. This syntax here is slightly different than TidierData.jl, however, because SQL does not drop the joining column, for the metadata storage, it is preferable for the names to be different
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @right_join(df_join, id2, id)\n @collect\n end\n7\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 missing missing missing missing AK Y 68\n 7 \u2502 missing missing missing missing AM X 74\n
source
# TidierDB.@select
\u2014 Macro.
@select(sql_query, columns)\n
Select specified columns from a SQL table.
Arguments
sql_query
: The SQL query to select columns from.columns
: Expressions specifying the columns to select. Columns can be specified by name, and new columns can be created with expressions using existing column values.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @select(groups:percent)\n @collect\n end\n10\u00d73 DataFrame\n Row \u2502 groups value percent \n \u2502 String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 bb 1 0.1\n 2 \u2502 aa 2 0.2\n 3 \u2502 bb 3 0.3\n 4 \u2502 aa 4 0.4\n 5 \u2502 bb 5 0.5\n 6 \u2502 aa 1 0.6\n 7 \u2502 bb 2 0.7\n 8 \u2502 aa 3 0.8\n 9 \u2502 bb 4 0.9\n 10 \u2502 aa 5 1.0\n\njulia> @chain db_table(db, :df_mem) begin\n @select(contains(\"e\"))\n @collect\n end\n10\u00d72 DataFrame\n Row \u2502 value percent \n \u2502 Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 0.1\n 2 \u2502 2 0.2\n 3 \u2502 3 0.3\n 4 \u2502 4 0.4\n 5 \u2502 5 0.5\n 6 \u2502 1 0.6\n 7 \u2502 2 0.7\n 8 \u2502 3 0.8\n 9 \u2502 4 0.9\n 10 \u2502 5 1.0\n
source
# TidierDB.@semi_join
\u2014 Macro.
@semi_join(sql_query, join_table, new_table_col, orignal_table_col)\n
Perform a semi join between two SQL queries based on a specified condition. This syntax here is slightly different than TidierData.jl, however, because SQL does not drop the joining column, for the metadata storage, it is preferable for the names to be different
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.new_table_col
: Column from the new table that matches for join.orignal_table_col
: Column from the original table that matches for join.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @semi_join(df_join, id2, id)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String? String? Int64? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AC bb 3 0.3\n 3 \u2502 AE bb 5 0.5\n 4 \u2502 AG bb 2 0.7\n 5 \u2502 AI bb 4 0.9\n
source
# TidierDB.@slice_max
\u2014 Macro.
@slice_max(sql_query, column, n = 1)\n
Select rows with the largest values in specified column. This will always return ties.
Arguments
sql_query
: The SQL query to operate on.column
: Column to identify the largest values.n
: The number of rows to select with the largest values for each specified column. Default is 1, which selects the row with the largest value.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_max(value, n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_max(value)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 id groups value percent rank_col \n \u2502 String? String? Int64? Float64? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AE bb 5 0.5 1\n 2 \u2502 AJ aa 5 1.0 1\n
source
# TidierDB.@slice_min
\u2014 Macro.
@slice_min(sql_query, column, n = 1)\n
Select rows with the smallest values in specified column. This will always return ties.
Arguments
sql_query
: The SQL query to operate on.column
: Column to identify the smallest values.n
: The number of rows to select with the smallest values for each specified column. Default is 1, which selects the row with the smallest value.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_min(value, n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_min(value)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 id groups value percent rank_col \n \u2502 String? String? Int64? Float64? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 1\n 2 \u2502 AF aa 1 0.6 1\n
source
# TidierDB.@slice_sample
\u2014 Macro.
@slice_sample(sql_query, n)\n
Randomly select a specified number of rows from a SQL table.
Arguments
sql_query
: The SQL query to operate on.n
: The number of rows to randomly select.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_sample(n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_sample()\n @collect\n end;\n
source
# TidierDB.@summarise
\u2014 Macro.
@summarise(sql_query, exprs...)\n
Aggregate and summarize specified columns of a SQL table.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions defining the aggregation and summarization operations. These can specify simple aggregations like mean, sum, and count, or more complex expressions involving existing column values.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(across((value:percent), (mean, sum)))\n @arrange(groups)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 groups mean_value mean_percent sum_value sum_percent \n \u2502 String? Float64? Float64? Int128? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 0.6 15 3.0\n 2 \u2502 bb 3.0 0.5 15 2.5\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(test = sum(percent), n = n())\n @arrange(groups)\n @collect\n end\n2\u00d73 DataFrame\n Row \u2502 groups test n \n \u2502 String? Float64? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 5\n 2 \u2502 bb 2.5 5\n
source
# TidierDB.@summarize
\u2014 Macro.
@summarize(sql_query, exprs...)\n
Aggregate and summarize specified columns of a SQL table.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions defining the aggregation and summarization operations. These can specify simple aggregations like mean, sum, and count, or more complex expressions involving existing column values.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(across((ends_with(\"e\"), starts_with(\"p\")), (mean, sum)))\n @arrange(groups)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 groups mean_value mean_percent sum_value sum_percent \n \u2502 String? Float64? Float64? Int128? Float64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 0.6 15 3.0\n 2 \u2502 bb 3.0 0.5 15 2.5\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(test = sum(percent), n = n())\n @arrange(groups)\n @collect\n end\n2\u00d73 DataFrame\n Row \u2502 groups test n \n \u2502 String? Float64? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 5\n 2 \u2502 bb 2.5 5\n
source
# TidierDB.@window_frame
\u2014 Macro.
@window_frame(sql_query, frame_start::Int, frame_end::Int)\n
Define the window frame for window functions in a SQL query, specifying the range of rows to include in the calculation relative to the current row.
Arguments
sql_query: The SQL query to operate on, expected to be an instance of SQLQuery.
frame_start
: The starting point of the window frame. A positive value indicates the start after the current row (FOLLOWING), a negative value indicates before the current row (PRECEDING), and 0 indicates the current row.frame_end
: The ending point of the window frame. A positive value indicates the end after the current row (FOLLOWING), a negative value indicates before the current row (PRECEDING), and 0 indicates the current row.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n
source
# TidierDB.@window_order
\u2014 Macro.
@window_order(sql_query, columns...)\n
Specify the order of rows for window functions within a SQL query.
Arguments
sql_query
: The SQL query to operate on.columns
: Columns to order the rows by for the window function. Can include multiple columns for nested sorting. Prepend a column name with - for descending order.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(:duckdb);\n\njulia> copy_to(db, df, \"df_mem\");\n
source
"},{"location":"reference/#reference-internal-functions","title":"Reference - Internal functions","text":""},{"location":"examples/generated/UserGuide/athena/","title":"Using Athena","text":"To use the Athena AWS backend with TidierDB, the setup and a small syntax difference are covered here.
"},{"location":"examples/generated/UserGuide/athena/#connecting","title":"Connecting","text":"Connection is established through AWS.jl as shown below.
using TidierDB, AWS\nset_sql_mode(:athena)\n# Replace your credentials as needed below\naws_access_key_id = get(ENV,\"AWS_ACCESS_KEY_ID\",\"key\")\naws_secret_access_key = get(ENV, \"AWS_SECRET_ACCESS_KEY\",\"secret_key\")\naws_region = get(ENV,\"AWS_DEFAULT_REGION\",\"region\")\n\nconst AWS_GLOBAL_CONFIG = Ref{AWS.AWSConfig}()\ncreds = AWSCredentials(aws_access_key_id, aws_secret_access_key)\n\nAWS_GLOBAL_CONFIG[] = AWS.global_aws_config(region=aws_region, creds=creds)\n\ncatalog = \"AwsDataCatalog\"\nworkgroup = \"primary\"\ndb = \"demodb\"\nall_results = true\nresults_per_increment = 10\nout_loc = \"s3://location/\"\n\nathena_params = Dict(\n \"ResultConfiguration\" => Dict(\n \"OutputLocation\" => out_loc\n ),\n \"QueryExecutionContext\" => Dict(\n \"Database\" => db,\n \"Catalog\" => catalog\n ),\n \"Workgroup\" => workgroup\n)\n
"},{"location":"examples/generated/UserGuide/athena/#db_table-differences","title":"db_table
differences","text":"There are two differences for db_table
which are seen in the query below
\"demodb.table_name
db_table
requires a third argument: the athena_params from above.from_query
with Athena to reduce number of queries","text":"Throughout TidierDB, each time db_table
is called, it queries the databases to get the metadata. Considering how AWS Athena logs queries, a user may want to reduce the number of queries. This can be done by saving the results of db_table
, and then using from_query with those results for further queries as shown below.
mtcars = db_table(AWS_GLOBAL_CONFIG[], \"demodb.mtcars\", athena_params)\n@chain from_query(mtcars) begin\n @filter(cyl > 4)\n @group_by(cyl)\n @summarize(mpg = mean(mpg))\n #@show_query\n @collect\nend\n
2\u00d72 DataFrame\n Row \u2502 cyl mpg\n \u2502 Int64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 6 19.7429\n 2 \u2502 8 15.1\n
I would like to acknowledge the work of Manu Francis and this blog post, which helped guide this process
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/from_queryex/","title":"Reusing Part of a Query","text":"While using TidierDB, you may need to generate part of a query and reuse it multiple times. from_query()
enables a query portion to be reused multiple times as shown below.
import TidierDB as DB\ncon = DB.connect(:duckdb)\nDB.copy_to(con, \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\", \"mtcars2\")\n
Start a query to analyze fuel efficiency by number of cylinders. However, to further build on this query later, end the chain without using @show_query
or @collect
query = DB.@chain DB.db_table(con, :mtcars2) begin\n DB.@group_by cyl\n DB.@summarize begin\n across(mpg, (mean, minimum, maximum))\n num_cars = n()\n end\n DB.@mutate begin\n efficiency = case_when(\n mean_mpg >= 25, \"High\",\n mean_mpg >= 15, \"Moderate\",\n \"Low\" )\n end\nend;\n
Now, from_query
will allow you to reuse the query to calculate the average horsepower for each efficiency category
DB.@chain DB.from_query(query) begin\n DB.@left_join(mtcars2, cyl, cyl)\n DB.@group_by(efficiency)\n DB.@summarize(avg_hp = mean(hp))\n DB.@collect\nend\n
2\u00d72 DataFrame\n Row \u2502 efficiency avg_hp\n \u2502 String? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Moderate 180.238\n 2 \u2502 High 82.6364\n
Reuse the query again to find the car with the highest MPG for each cylinder category
DB.@chain DB.from_query(query) begin\n DB.@left_join(mtcars2, cyl, cyl)\n DB.@group_by cyl\n DB.@slice_max(mpg)\n DB.@select model cyl mpg\n DB.@collect\nend\n
3\u00d73 DataFrame\n Row \u2502 model cyl mpg\n \u2502 String? Int64? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Pontiac Firebird 8 19.2\n 2 \u2502 Toyota Corolla 4 33.9\n 3 \u2502 Hornet 4 Drive 6 21.4\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/functions_pass_to_DB/","title":"Writing Functions/Macros with TidierDB Chains","text":"How can functions pass arguments to a TidierDB chain?
In short, you have to use a macro instead in conjunction with @interpolate
To write a macro that will take arguments and pass them to a TidierDB chain, there are 3 steps:
!!
@interpolate
to make these arguments accessible to the chain. @interpolate
takes tuples as arguments (one for the !!
name, and one for the actual content you want the chain to use)@interpolate
and then the chain macro sequentiallyusing TidierDB\npath = \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\"\ncopy_to(db, path, \"mtcars\");\n\n# STEP 1\nmacro f1(conditions, columns) # The argument names will be names of the `!!` values\n return quote\n # add chain here\n @chain db_table(db, :mtcars) begin\n @filter(!!conditions > 3)\n @select(!!columns)\n @aside @show_query _\n @collect\n end # ends the chain\n end # ends the quote.\nend # ends the macro\n
# STEP 2\nvariable = :gear;\ncols = [:model, :mpg, :gear, :wt];\n@interpolate((conditions, variable), (columns, cols));\n@f1(variable, cols)\n
17\u00d74 DataFrame\n Row \u2502 model mpg gear wt\n \u2502 String? Float64? Int32? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 21.0 4 2.62\n 2 \u2502 Mazda RX4 Wag 21.0 4 2.875\n 3 \u2502 Datsun 710 22.8 4 2.32\n \u22ee \u2502 \u22ee \u22ee \u22ee \u22ee\n 15 \u2502 Ferrari Dino 19.7 5 2.77\n 16 \u2502 Maserati Bora 15.0 5 3.57\n 17 \u2502 Volvo 142E 21.4 4 2.78\n 11 rows omitted\n
Let's say you wanted to filter on a new variable with a different name and select new columns,
new_condition = :wt;\nnew_cols = [:model, :drat]\n@interpolate((conditions, new_condition), (columns, new_cols));\n@f1(new_condition, new_cols)\n
20\u00d72 DataFrame\n Row \u2502 model drat\n \u2502 String? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Hornet 4 Drive 3.08\n 2 \u2502 Hornet Sportabout 3.15\n 3 \u2502 Valiant 2.76\n \u22ee \u2502 \u22ee \u22ee\n 18 \u2502 Pontiac Firebird 3.08\n 19 \u2502 Ford Pantera L 4.22\n 20 \u2502 Maserati Bora 3.54\n 14 rows omitted\n
You can also interpolate vectors of strings into a @filter(col in (values))
as well by using the following syntax @filter(col in [!!values])
In short, the first argument in @interpolate
must be the name of the macro argument it refers to, and the second argument is what you would like to replace it.
We recognize this adds friction and that it is not ideal, but given the TidierDB macro expressions/string interplay, this is currently the most graceful and functional option available and hopefully a temporary solution to better interpolation that mirrors TidierData.jl.
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/getting_started/","title":"Getting Started","text":"To use TidierDB.jl, you will have to set up a connection. TidierDB.jl gives you access to duckdb via duckdb_open
and duckdb_connect
. However, to use MySql, ClickHouse, MSSQL, Postgres, or SQLite, you will have to load those packages in first.
If you plan to use TidierDB.jl with TidierData.jl or Tidier.jl, it is most convenient to load the packages as follows:
using TidierData\nimport TidierDB as DB\n
Alternatively, using Tidier
will import TidierDB in the above manner for you, where TidierDB functions and macros will be available as DB.@mutate()
and so on, and the TidierData equivalent would be @mutate()
.
There are two ways to connect to the database. You can use connect
without any need to load any additional packages.
For example Connecting to MySQL
conn = connect(:mysql; host=\"localhost\", user=\"root\", password=\"password\", db=\"mydb\")\n
versus connecting to DuckDB
conn = connect(:duckdb)\n
connect()
and will require using the respective libraries outlined below to establish a connectionAlternatively, you can use the packages outlined below to establish a connection through their respective methods.
For DuckDB, SQLite, and MySQL, copy_to()
lets you copy data to the database and query there. ClickHouse, MSSQL, and Postgres support for copy_to()
has not been added yet.
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/key_differences/","title":"Key Differences from TidierData.jl","text":"There are a few important syntax and behavior differences between TidierDB.jl and TidierData.jl outlined below.
"},{"location":"examples/generated/UserGuide/key_differences/#creating-a-database","title":"Creating a database","text":"For these examples we will use DuckDB, the default backend, although SQLite, Postgres, MySQL, MariaDB, MSSQL, and ClickHouse are possible. If you have an existing DuckDB connection, then this step is not required. For these examples, we will create a data frame and copy it to an in-memory DuckDB database.
using DataFrames, TidierDB\n\ndf = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9],\n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10],\n value = repeat(1:5, 2),\n percent = 0.1:0.1:1.0);\n\ndb = connect(:duckdb);\n\ncopy_to(db, df, \"df_mem\"); # copying over the data frame to an in-memory database\n
"},{"location":"examples/generated/UserGuide/key_differences/#row-ordering","title":"Row ordering","text":"DuckDB benefits from aggressive parallelization of pipelines. This means that if you have multiple threads enabled in Julia, which you can check or set using Threads.nthreads()
, DuckDB will use multiple threads. However, because many operations are multi-threaded, the resulting row order is inconsistent. If row order needs to be deterministic for your use case, make sure to apply an @arrange(column_name_1, column_name_2, etc...)
prior to collecting the results.
When using TidierDB, db_table(connection, :table_name)
is used to start a chain.
In TidierDB, when performing @group_by
then @mutate
, the table will be ungrouped after applying all of the mutations in the clause to the grouped data. To perform subsequent grouped operations, the user would have to regroup the data. This is demonstrated below.
@chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarize(mean_percent = mean(percent))\n @collect\n end\n
2\u00d72 DataFrame Rowgroupsmean_percentString?Float64?1bb0.52aa0.6 Regrouping following @mutate
@chain db_table(db, :df_mem) begin\n @group_by(groups)\n @mutate(max = maximum(percent), min = minimum(percent))\n @group_by(groups)\n @summarise(mean_percent = mean(percent))\n @collect\nend\n
2\u00d72 DataFrame Rowgroupsmean_percentString?Float64?1bb0.52aa0.6 "},{"location":"examples/generated/UserGuide/key_differences/#joining","title":"Joining","text":"There is one key difference for joining:
The column on both the new and old table must be specified. They do not need to be the same, and given SQL behavior where both columns are kept when joining two tables, it is preferable if they have different names. This avoids \"ambiguous reference\" errors that would otherwise come up and complicate the use of tidy selection for columns. Athena has an additional slight difference given the need for parameters, which is covered in the Athena documentation page.
df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\ncopy_to(db, df2, \"df_join\");\n\n@chain db_table(db, :df_mem) begin\n @left_join(df_join, id2, id)\n @collect\nend\n
10\u00d77 DataFrame Rowidgroupsvaluepercentid2categoryscoreString?String?Int64?Float64?String?String?Int64?1AAbb10.1AAX882ACbb30.3ACY923AEbb50.5AEX774AGbb20.7AGY835AIbb40.9AIX956ABaa20.2missingmissingmissing7ADaa40.4missingmissingmissing8AFaa10.6missingmissingmissing9AHaa30.8missingmissingmissing10AJaa51.0missingmissingmissing "},{"location":"examples/generated/UserGuide/key_differences/#differences-in-case_when","title":"Differences in case_when()
","text":"In TidierDB, after the clause is completed, the result for the new column is separated by a comma ,
in contrast to TidierData.jl, where the result for the new column is separated by a =>
.
@chain db_table(db, :df_mem) begin\n @mutate(new_col = case_when(percent > .5, \"Pass\", # in TidierData, percent > .5 => \"Pass\",\n percent <= .5, \"Try Again\", # percent <= .5 => \"Try Again\"\n true, \"middle\"))\n @collect\n end\n
10\u00d75 DataFrame Rowidgroupsvaluepercentnew_colString?String?Int64?Float64?String?1AAbb10.1Try Again2ABaa20.2Try Again3ACbb30.3Try Again4ADaa40.4Try Again5AEbb50.5Try Again6AFaa10.6Pass7AGbb20.7Pass8AHaa30.8Pass9AIbb40.9Pass10AJaa51.0Pass "},{"location":"examples/generated/UserGuide/key_differences/#interpolation","title":"Interpolation","text":"To use !! Interpolation, instead of being able to define the alternate names/value in the global context, the user has to use @interpolate
. This will hopefully be fixed in future versions. Otherwise, the behavior is generally the same, although this creates friction around calling functions.
Also, when using interpolation with exponents, the interpolated value must go inside parentheses.
@interpolate((test, :percent)); # this still supports strings, vectors of names, and values\n\n@chain db_table(db, :df_mem) begin\n @mutate(new_col = case_when((!!test)^2 > .5, \"Pass\",\n (!!test)^2 < .5, \"Try Again\",\n \"middle\"))\n @collect\nend\n
10\u00d75 DataFrame\n Row \u2502 id groups value percent new_col\n \u2502 String? String? Int64? Float64? String?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 Try Again\n 2 \u2502 AB aa 2 0.2 Try Again\n 3 \u2502 AC bb 3 0.3 Try Again\n \u22ee \u2502 \u22ee \u22ee \u22ee \u22ee \u22ee\n 8 \u2502 AH aa 3 0.8 Pass\n 9 \u2502 AI bb 4 0.9 Pass\n 10 \u2502 AJ aa 5 1.0 Pass\n 4 rows omitted\n
"},{"location":"examples/generated/UserGuide/key_differences/#slicing-ties","title":"Slicing ties","text":"@slice_min()
and @slice_max()
will always return ties due to SQL behavior.
This page was generated using Literate.jl.
"}]} \ No newline at end of file