#
TidierDB.copy_to
— Method.
copy_to(conn, df_or_path, "name")
@@ -808,7 +808,7 @@ Reference - Exported functionsjulia> copy_to(db, df, "test");
#
TidierDB.db_table
— Function.
db_table(database, table_name, athena_params, delta = false, iceberg = false)
@@ -848,7 +848,7 @@ Reference - Exported functions 3 โ value BIGINT 1 df_mem
4 โ percent DOUBLE 1 df_mem, false, DuckDB.Connection(":memory:"), TidierDB.CTE[], 0, nothing)
#
TidierDB.show_tables
— Method.
show_tables(con; GBQ_datasetname)
@@ -864,7 +864,7 @@ Reference - Exported functionsjulia> show_tables(db);
#
TidierDB.warnings
— Method.
warnings(show::Bool)
@@ -879,7 +879,7 @@ Reference - Exported functionsjulia> warnings(true);
#
TidierDB.@anti_join
— Macro.
@anti_join(sql_query, join_table, orignal_table_col = new_table_col)
@@ -922,7 +922,7 @@ Reference - Exported functions 4 โ AH aa 3 0.8
5 โ AJ aa 5 1.0
#
TidierDB.@arrange
— Macro.
@arrange(sql_query, columns...)
@@ -962,7 +962,7 @@ Reference - Exported functions 9 โ AJ aa 5 1.0
10 โ AE bb 5 0.5
#
TidierDB.@collect
— Macro.
@collect(sql_query, stream = false)
@@ -999,7 +999,7 @@ Reference - Exported functions 9 โ AI bb 4 0.9
10 โ AJ aa 5 1.0
#
TidierDB.@count
— Macro.
@count(sql_query, columns...)
@@ -1032,7 +1032,7 @@ Reference - Exported functions 1 โ aa 5
2 โ bb 5
#
TidierDB.@create_view
— Macro.
@create_view(sql_query, name, replace = true)
@@ -1061,7 +1061,7 @@ Reference - Exported functions 1 โ id BIGINT 1 viewer
2 โ value BIGINT 1 viewer, false, DuckDB.DB(":memory:"), TidierDB.CTE[], 0, nothing, "", "", 0)
#
TidierDB.@distinct
— Macro.
@distinct(sql_query, columns...)
@@ -1114,7 +1114,7 @@ Reference - Exported functions 9 โ AI bb 4 0.9
10 โ AJ aa 5 1.0
#
TidierDB.@filter
— Macro.
@filter(sql_query, conditions...)
@@ -1170,7 +1170,7 @@ Reference - Exported functions 1 โ aa 0.6
2 โ bb 0.5
#
TidierDB.@full_join
— Macro.
@full_join(sql_query, join_table, orignal_table_col = new_table_col)
@@ -1220,7 +1220,7 @@ Reference - Exported functions 10 โ AJ aa 5 1.0 missing missing missing
11 โ missing missing missing missing AM X 74
#
TidierDB.@group_by
— Macro.
@group_by(sql_query, columns...)
@@ -1253,7 +1253,7 @@ Reference - Exported functions 1 โ aa
2 โ bb
#
TidierDB.@head
— Macro.
@head(sql_query, value)
@@ -1284,7 +1284,7 @@ Reference - Exported functionsโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
1 โ AA bb 1 0.1
#
TidierDB.@inner_join
— Macro.
@inner_join(sql_query, join_table, orignal_table_col = new_table_col)
@@ -1327,7 +1327,7 @@ Reference - Exported functions 4 โ AG bb 2 0.7 AG Y 83
5 โ AI bb 4 0.9 AI X 95
#
TidierDB.@left_join
— Macro.
@left_join(sql_query, join_table, orignal_table_col = new_table_col)
@@ -1398,16 +1398,17 @@ Reference - Exported functions 9 โ AH aa 3 0.8 missing missing missing
10 โ AJ aa 5 1.0 missing missing missing
#
TidierDB.@mutate
— Macro.
@mutate(sql_query, exprs...)
+@mutate(sql_query, exprs...; _by)
Mutate SQL table rows by adding new columns or modifying existing ones.
Arguments
sql_query
: The SQL query to operate on.
exprs
: Expressions for mutating the table. New columns can be added or existing columns modified using column_name = expression syntax, where expression can involve existing columns.
+_by
: optional argument that supports single column names, or vectors of columns to allow for grouping for the transformation in the macro call
Examples
julia> df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9],
@@ -1437,8 +1438,27 @@ Reference - Exported functions 8 โ AH aa 12 0.8 0.64
9 โ AI bb 16 0.9 0.81
10 โ AJ aa 20 1.0 1.0
+
+julia> @chain db_table(db, :df_mem) begin
+ @mutate(max = maximum(percent), sum = sum(percent), _by = groups)
+ @collect
+ end
+10×6 DataFrame
+ Row │ id groups value percent max sum
+ │ String String Int64 Float64 Float64 Float64
+─────┼──────────────────────────────────────────────────
+ 1 │ AB aa 2 0.2 1.0 3.0
+ 2 │ AD aa 4 0.4 1.0 3.0
+ 3 │ AF aa 1 0.6 1.0 3.0
+ 4 │ AH aa 3 0.8 1.0 3.0
+ 5 │ AJ aa 5 1.0 1.0 3.0
+ 6 │ AA bb 1 0.1 0.9 2.5
+ 7 │ AC bb 3 0.3 0.9 2.5
+ 8 │ AE bb 5 0.5 0.9 2.5
+ 9 │ AG bb 2 0.7 0.9 2.5
+ 10 │ AI bb 4 0.9 0.9 2.5
-
+
#
TidierDB.@rename
— Macro.
@rename(sql_query, renamings...)
@@ -1475,7 +1495,7 @@ Reference - Exported functions 9 โ AI bb 4 0.9
10 โ AJ aa 5 1.0
-
+
#
TidierDB.@right_join
— Macro.
@right_join(sql_query, join_table, orignal_table_col = new_table_col)
@@ -1539,7 +1559,7 @@ Reference - Exported functions 5 โ AI bb 4 0.9 AI X 95
6 โ missing missing missing missing AM X 74
-
+
#
TidierDB.@select
— Macro.
@select(sql_query, columns)
@@ -1600,7 +1620,7 @@ Reference - Exported functions 9 โ 4 0.9
10 โ 5 1.0
-
+
#
TidierDB.@semi_join
— Macro.
@semi_join(sql_query, join_table, orignal_table_col = new_table_col)
@@ -1643,7 +1663,7 @@ Reference - Exported functions 4 โ AG bb 2 0.7
5 โ AI bb 4 0.9
-
+
#
TidierDB.@slice_max
— Macro.
@slice_max(sql_query, column, n = 1)
@@ -1682,7 +1702,7 @@ Reference - Exported functions 1 โ AE bb 5 0.5 1
2 โ AJ aa 5 1.0 1
-
+
#
TidierDB.@slice_min
— Macro.
@slice_min(sql_query, column, n = 1)
@@ -1721,7 +1741,7 @@ Reference - Exported functions 1 โ AA bb 1 0.1 1
2 โ AF aa 1 0.6 1
-
+
#
TidierDB.@slice_sample
— Macro.
@slice_sample(sql_query, n)
@@ -1753,7 +1773,7 @@ Reference - Exported functions @collect
end;
-
+
#
TidierDB.@summarise
— Macro.
@summarise(sql_query, exprs...)
@@ -1800,16 +1820,17 @@ Reference - Exported functions 1 โ aa 3.0 5
2 โ bb 2.5 5
-
+
#
TidierDB.@summarize
— Macro.
- @summarize(sql_query, exprs...)
+ @summarize(sql_query, exprs...; _by)
Aggregate and summarize specified columns of a SQL table.
Arguments
sql_query
: The SQL query to operate on.
exprs
: Expressions defining the aggregation and summarization operations. These can specify simple aggregations like mean, sum, and count, or more complex expressions involving existing column values.
+_by
: optional argument that supports single column names, or vectors of columns to allow for grouping for the aggregation in the macro call
Examples
julia> df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9],
@@ -1846,8 +1867,20 @@ Reference - Exported functionsโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโ
1 โ aa 3.0 5
2 โ bb 2.5 5
+
+julia> @chain db_table(db, :df_mem) begin
+ @summarise(test = sum(percent), n = n(), _by = groups)
+ @arrange(groups)
+ @collect
+ end
+2×3 DataFrame
+ Row │ groups test n
+ │ String Float64 Int64
+─────┼────────────────────────
+ 1 │ aa 3.0 5
+ 2 │ bb 2.5 5
-
+
#
TidierDB.@union
— Macro.
@union(sql_query1, sql_query2)
@@ -1916,7 +1949,7 @@ Reference - Exported functions 2 โ 2 20
3 โ 3 30
-
+
#
TidierDB.@window_frame
— Macro.
@window_frame(sql_query, args...)
@@ -1976,7 +2009,7 @@ Reference - Exported functions #@show_query
end;
-
+
#
TidierDB.@window_order
— Macro.
@window_order(sql_query, columns...)
@@ -2005,7 +2038,7 @@ Reference - Exported functions #@show_query
end;
-
+
Reference - Internal functions¤
@@ -2044,7 +2077,7 @@ Reference - Internal functions 5 โ 2 20
6 โ 3 30
-
+
@@ -2121,7 +2154,7 @@ Reference - Internal functions
-
+
@@ -2132,7 +2165,7 @@ Reference - Internal functions
-
+
TidierDB.jl is a 100% Julia implementation of the dbplyr R package, and similar to Python's ibis package.
The main goal of TidierDB.jl is to bring the syntax of Tidier.jl to multiple SQL backends, making it possible to analyze data directly on databases without needing to copy the entire database into memory.
"},{"location":"#currently-supported-backends-include","title":"Currently supported backends include:","text":"DuckDB (default) duckdb()
ClickHouse clickhouse()
SQLite sqlite()
Postgres postgres()
MySQL mysql()
MariaDB mysql()
MSSQL mssql()
Athena athena()
Snowflake snowflake()
Databricks databricks()
Google Big Query gbq()
Oracle oracle()
Change the backend using set_sql_mode()
- for example - set_sql_mode(databricks())
For the stable version:
] add TidierDB\n
TidierDB.jl currently supports:
Category Supported Macros and Functions Data Manipulation@arrange
, @group_by
, @filter
, @select
, @mutate
(supports across
), @summarize
/@summarise
(supports across
), @distinct
Joining @left_join
, @right_join
, @inner_join
, @anti_join
, @full_join
, @semi_join
, @union
, @union_all
Slice and Order @slice_min
, @slice_max
, @slice_sample
, @order
, @window_order
, @window_frame
Utility @show_query
, @collect
, @head
, @count
, show_tables
, @create_view
, drop_view
Helper Functions across
, desc
, if_else
, case_when
, n
, starts_with
, ends_with
, contains
, as_float
, as_integer
, as_string
, is_missing
, missing_if
, replace_missing
TidierStrings.jl Functions str_detect
, str_replace
, str_replace_all
, str_remove_all
, str_remove
TidierDates.jl Functions year
, month
, day
, hour
, min
, second
, floor_date
, difftime
, mdy
, ymd
, dmy
Aggregate Functions mean
, minimum
, maximum
, std
, sum
, cumsum
, cor
, cov
, var
, all aggregate SQL functions
@summarize
supports any SQL aggregate function in addition to the list above. Simply write the function as written in SQL syntax and it will work. @mutate
supports all builtin SQL functions as well.
When using the DuckDB backend, if db_table
receives a file path ( .parquet
, .json
, .csv
, iceberg
or delta
), it does not copy it into memory. This allows for queries on files too big for memory. db_table
also supports S3 bucket locations via DuckDB.
Typically, you will want to use TidierDB alongside TidierData because there is certain functionality (such as pivoting) that is only supported in TidierData and can only be performed on data frames.
Our recommended path for using TidierDB is to import the package so that there are no namespace conflicts with TidierData. Once TidierDB is integrated with Tidier, then Tidier will automatically load the packages in this fashion.
First, let's develop and execute a query using TidierDB. Notice that all top-level macros and functions originating from TidierDB start with a DB
prefix. Any functions defined within macros do not need to be prefixed with DB
because they are pseudofunctions that are converted into SQL code.
Even though the code reads similarly to TidierData, note that no computational work actually occurs until you run DB.@collect()
, which runs the SQL query and instantiates the result as a DataFrame.
using TidierData\nimport TidierDB as DB\n\ndb = DB.connect(DB.duckdb());\npath_or_name = \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\"\n\n@chain DB.db_table(db, path_or_name) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@collect\nend\n
2\u00d75 DataFrame\n Row \u2502 cyl mpg mpg_squared mpg_rounded mpg_efficiency \n \u2502 Int64? Float64? Float64? Float64? String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 27.3444 747.719 27.0 efficient\n 2 \u2502 6 19.7333 389.404 20.0 moderate\n
"},{"location":"#what-if-we-wanted-to-pivot-the-result","title":"What if we wanted to pivot the result?","text":"We cannot do this using TidierDB. However, we can call @pivot_longer()
from TidierData after the result of the query has been instantiated as a DataFrame, like this:
@chain DB.db_table(db, path_or_name) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@collect\n @pivot_longer(everything(), names_to = \"variable\", values_to = \"value\")\nend\n
10\u00d72 DataFrame\n Row \u2502 variable value \n \u2502 String Any \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 cyl 4\n 2 \u2502 cyl 6\n 3 \u2502 mpg 27.3444\n 4 \u2502 mpg 19.7333\n 5 \u2502 mpg_squared 747.719\n 6 \u2502 mpg_squared 389.404\n 7 \u2502 mpg_rounded 27.0\n 8 \u2502 mpg_rounded 20.0\n 9 \u2502 mpg_efficiency efficient\n 10 \u2502 mpg_efficiency moderate\n
"},{"location":"#what-sql-query-does-tidierdb-generate-for-a-given-piece-of-julia-code","title":"What SQL query does TidierDB generate for a given piece of Julia code?","text":"We can replace DB.@collect
with DB.@show_query
to reveal the underlying SQL query being generated by TidierDB. To handle complex queries, TidierDB makes heavy use of Common Table Expressions (CTE), which are a useful tool to organize long queries.
@chain DB.db_table(db, path_or_name) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@show_query\nend\n
WITH cte_1 AS (\nSELECT *\n FROM mtcars\n WHERE NOT (starts_with(model, 'M'))),\ncte_2 AS (\nSELECT cyl, AVG(mpg) AS mpg\n FROM cte_1\n GROUP BY cyl),\ncte_3 AS (\nSELECT cyl, mpg, POWER(mpg, 2) AS mpg_squared, ROUND(mpg) AS mpg_rounded, CASE WHEN mpg >= POWER(cyl, 2) THEN 'efficient' WHEN mpg < 15.2 THEN 'inefficient' ELSE 'moderate' END AS mpg_efficiency\n FROM cte_2 ),\ncte_4 AS (\nSELECT *\n FROM cte_3\n WHERE mpg_efficiency in ('moderate', 'efficient')) \nSELECT *\n FROM cte_4 \n ORDER BY mpg_rounded DESC\n
"},{"location":"#tidierdb-is-already-quite-fully-featured-supporting-advanced-tidierdata-functions-like-across-for-multi-column-selection","title":"TidierDB is already quite fully-featured, supporting advanced TidierData functions like across()
for multi-column selection.","text":"@chain DB.db_table(db, path_or_name) begin\n DB.@group_by(cyl)\n DB.@summarize(across((starts_with(\"a\"), ends_with(\"s\")), (mean, sum)))\n DB.@collect\nend\n
3\u00d75 DataFrame\n Row \u2502 cyl am_mean vs_mean am_sum vs_sum \n \u2502 Int64? Float64? Float64? Int128? Int128? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 0.727273 0.909091 8 10\n 2 \u2502 6 0.428571 0.571429 3 4\n 3 \u2502 8 0.142857 0.0 2 0\n
Bang bang !!
interpolation for columns and values is also supported.
There are a few subtle but important differences from Tidier.jl outlined here.
"},{"location":"#missing-a-function-or-backend","title":"Missing a function or backend?","text":"You can use any existing SQL function within @mutate
with the correct SQL syntax and it should just work.
But if you run into problems please open an issue, and we will be happy to take a look!
"},{"location":"reference/","title":"Reference","text":""},{"location":"reference/#index","title":"Index","text":"TidierDB.connect
TidierDB.copy_to
TidierDB.db_table
TidierDB.show_tables
TidierDB.warnings
TidierDB.@anti_join
TidierDB.@arrange
TidierDB.@collect
TidierDB.@count
TidierDB.@create_view
TidierDB.@distinct
TidierDB.@filter
TidierDB.@full_join
TidierDB.@group_by
TidierDB.@head
TidierDB.@inner_join
TidierDB.@left_join
TidierDB.@mutate
TidierDB.@rename
TidierDB.@right_join
TidierDB.@select
TidierDB.@semi_join
TidierDB.@slice_max
TidierDB.@slice_min
TidierDB.@slice_sample
TidierDB.@summarise
TidierDB.@summarize
TidierDB.@union
TidierDB.@union_all
TidierDB.@window_frame
TidierDB.@window_order
# TidierDB.connect
\u2014 Method.
connect(backend; kwargs...)\n
This function establishes a database connection based on the specified backend and connection parameters and sets the SQL mode
Arguments
backend
: type specifying the database backend to connect to. Supported backends are:
duckdb()
, sqlite()
(SQLite), mssql()
, mysql()
(for MariaDB and MySQL), clickhouse()
, postgres()
kwargs
: Keyword arguments specifying the connection parameters for the selected backend. The required parameters vary depending on the backend:
MySQL:
host
: The host name or IP address of the MySQL server. Default is \"localhost\".
user
: The username for authentication. Default is an empty string.
password
: The password for authentication.
db
: The name of the database to connect to (optional).
port
: The port number of the MySQL server (optional).
Returns
Examples
# Connect to MySQL\n# conn = connect(mysql(); host=\"localhost\", user=\"root\", password=\"password\", db=\"mydb\")\n# Connect to PostgreSQL using LibPQ\n# conn = connect(postgres(); host=\"localhost\", dbname=\"mydb\", user=\"postgres\", password=\"password\")\n# Connect to ClickHouse\n# conn = connect(clickhouse(); host=\"localhost\", port=9000, database=\"mydb\", user=\"default\", password=\"\")\n# Connect to SQLite\n# conn = connect(sqlite())\n# Connect to Google Big Query\n# conn = connect(gbq(), \"json_user_key_path\", \"location\")\n# Connect to Snowflake\n# conn = connect(snowflake(), \"ac_id\", \"token\", \"Database_name\", \"Schema_name\", \"warehouse_name\")\n# Connect to Microsoft SQL Server\n# conn = connect(mssql(), \"DRIVER={ODBC Driver 18 for SQL Server};SERVER=host,1433;UID=sa;PWD=YourPassword;Encrypt=no;TrustServerCertificate=yes\")\n# Connect to DuckDB\n# connect to Google Cloud via DuckDB\n# google_db = connect(duckdb(), :gbq, access_key=\"string\", secret_key=\"string\")\n# Connect to AWS via DuckDB\n# aws_db = connect(duckdb(), :aws, aws_access_key_id=get(ENV, \"AWS_ACCESS_KEY_ID\", \"access_key\"), aws_secret_access_key=get(ENV, \"AWS_SECRET_ACCESS_KEY\", \"secret_access key\"), aws_region=get(ENV, \"AWS_DEFAULT_REGION\", \"us-east-1\"))\n# Connect to MotherDuck\n# connect(duckdb(), \"\"md://...\"\") for first connection, vs connect(duckdb(), \"md:\") for reconnection\n# Connect to existing database file\n# connect(duckdb(), \"path/to/database.duckdb\")\n# Open an in-memory database\njulia> db = connect(duckdb())\nDuckDB.Connection(\":memory:\")\n
source
# TidierDB.copy_to
\u2014 Method.
copy_to(conn, df_or_path, \"name\")\n
Allows the user to copy a DataFrame to the database connection. Currently supports DuckDB, SQLite, and MySQL.
Arguments
conn
: the database connection
df
: dataframe to be copied or path to serve as source. With DuckDB, path supports .csv, .json, .parquet to be used without copying intermediary df.
name
: name as string for the database to be used
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"test\");\n
source
# TidierDB.db_table
\u2014 Function.
db_table(database, table_name, athena_params, delta = false, iceberg = false)\n
db_table
starts the underlying SQL query struct, adding the metadata and table. If paths are passed directly to db_table instead of a name it will not copy it to memory, but rather read directly from the file. db_table
only supports direct file paths to a table. It does not support database file paths such as dbname.duckdb
or dbname.sqlite
. Such files must be used with connect first.
Arguments
database
: The Database or connection object
table_name
: table name as a string (name, local path, or URL). - CSV/TSV - Parquet - JSON - Iceberg - Delta - S3 tables from AWS or Google Cloud
*
wildcards to read all files of a type in a location such as: db_table(db, \"Path/to/testing_files/*.parquet\")
delta
: must be true to read delta files
iceberg
: must be true to read iceberg files
Example
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> db_table(db, \"df_mem\")\nTidierDB.SQLQuery(\"\", \"df_mem\", \"\", \"\", \"\", \"\", \"\", \"\", false, false, 4\u00d74 DataFrame\n Row \u2502 name type current_selxn table_name \n \u2502 String? String? Int64 String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 id VARCHAR 1 df_mem\n 2 \u2502 groups VARCHAR 1 df_mem\n 3 \u2502 value BIGINT 1 df_mem\n 4 \u2502 percent DOUBLE 1 df_mem, false, DuckDB.Connection(\":memory:\"), TidierDB.CTE[], 0, nothing)\n
source
# TidierDB.show_tables
\u2014 Method.
show_tables(con; GBQ_datasetname)\n
Shows the tables available in the database. Currently supports DuckDB, Databricks, Snowflake, GBQ, SQLite, and LibPQ.
Arguments
con
: connection to backend
GBQ_datasetname
: string of dataset name
Examples
julia> db = connect(duckdb());\n\njulia> show_tables(db);\n
source
# TidierDB.warnings
\u2014 Method.
warnings(show::Bool)\n
Sets the global warning flag to the specified boolean value.
Arguments
show::Bool
: A boolean value to set the warning flag. If true
, warnings will be enabled; if false
, warnings will be disabled.
Default Behavior
By default, the warning flag is set to false
, meaning that warnings are disabled unless explicitly enabled by calling this function with true
.
Example
julia> warnings(true);\n
source
# TidierDB.@anti_join
\u2014 Macro.
@anti_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform an anti join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable for the metadata storage that the join column names be different.
Arguments
sql_query
: The primary SQL query to operate on.
join_table
: The secondary SQL table to join with the primary query table.
orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or strings
new_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or strings
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @anti_join(\"df_join\", id = id2)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AB aa 2 0.2\n 2 \u2502 AD aa 4 0.4\n 3 \u2502 AF aa 1 0.6\n 4 \u2502 AH aa 3 0.8\n 5 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@arrange
\u2014 Macro.
@arrange(sql_query, columns...)\n
Order SQL table rows based on specified column(s).
Arguments
sql_query
: The SQL query to operate on.
columns
: Columns to order the rows by. Can include multiple columns for nested sorting. Wrap column name with desc()
for descending order.
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @arrange(value, desc(percent))\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AF aa 1 0.6\n 2 \u2502 AA bb 1 0.1\n 3 \u2502 AG bb 2 0.7\n 4 \u2502 AB aa 2 0.2\n 5 \u2502 AH aa 3 0.8\n 6 \u2502 AC bb 3 0.3\n 7 \u2502 AI bb 4 0.9\n 8 \u2502 AD aa 4 0.4\n 9 \u2502 AJ aa 5 1.0\n 10 \u2502 AE bb 5 0.5\n
source
# TidierDB.@collect
\u2014 Macro.
@collect(sql_query, stream = false)\n
@collect
runs the SQL query and instantiates the result as a DataFrame.
Arguments
sql_query
: The SQL query to operate on.
stream
: optional streaming for query/execution of results when using DuckDB. Defaults to false
Example
julia> db = connect(duckdb());\n\njulia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @collect db_table(db, \"df_mem\")\n10\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AB aa 2 0.2\n 3 \u2502 AC bb 3 0.3\n 4 \u2502 AD aa 4 0.4\n 5 \u2502 AE bb 5 0.5\n 6 \u2502 AF aa 1 0.6\n 7 \u2502 AG bb 2 0.7\n 8 \u2502 AH aa 3 0.8\n 9 \u2502 AI bb 4 0.9\n 10 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@count
\u2014 Macro.
@count(sql_query, columns...)\n
Count the number of rows grouped by specified column(s).
Arguments
sql_query
: The SQL query to operate on.
columns
: Columns to group by before counting. If no columns are specified, counts all rows in the query.
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @count(groups)\n @arrange(groups)\n @collect\n end\n2\u00d72 DataFrame\n Row \u2502 groups count \n \u2502 String Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 5\n 2 \u2502 bb 5\n
source
# TidierDB.@create_view
\u2014 Macro.
@create_view(sql_query, name, replace = true)\n
Create a view from a SQL query. Currently supports DuckDB, MySQL, GBQ, Postgres
Arguments
sql_query
: The SQL query to create a view from.
name
: The name of the view to create.
replace
: defaults to true if view should be replaced
Examples
julia> db = connect(duckdb());\n\njulia> df = DataFrame(id = [1, 2, 3], value = [10, 20, 30]);\n\njulia> copy_to(db, df, \"df1\");\n\njulia> @chain db_table(db, \"df1\") @create_view(viewer);\n\njulia> db_table(db, \"viewer\")\nTidierDB.SQLQuery(\"\", \"viewer\", \"\", \"\", \"\", \"\", \"\", \"\", false, false, 2\u00d74 DataFrame\n Row \u2502 name type current_selxn table_name \n \u2502 String String Int64 String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 id BIGINT 1 viewer\n 2 \u2502 value BIGINT 1 viewer, false, DuckDB.DB(\":memory:\"), TidierDB.CTE[], 0, nothing, \"\", \"\", 0)\n
source
# TidierDB.@distinct
\u2014 Macro.
@distinct(sql_query, columns...)\n
Select distinct rows based on specified column(s). Distinct works differently in TidierData vs SQL and therefore TidierDB. Distinct will also select only the columns it is given (or all if given none).
Arguments
sql_query
: The SQL query to operate on.
columns
: Columns to determine uniqueness. If no columns are specified, all columns are used to identify distinct rows.
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @distinct(value)\n @arrange(value)\n @collect\n end\n5\u00d71 DataFrame\n Row \u2502 value \n \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1\n 2 \u2502 2\n 3 \u2502 3\n 4 \u2502 4\n 5 \u2502 5\n\njulia> @chain db_table(db, :df_mem) begin\n @distinct\n @arrange(id)\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AB aa 2 0.2\n 3 \u2502 AC bb 3 0.3\n 4 \u2502 AD aa 4 0.4\n 5 \u2502 AE bb 5 0.5\n 6 \u2502 AF aa 1 0.6\n 7 \u2502 AG bb 2 0.7\n 8 \u2502 AH aa 3 0.8\n 9 \u2502 AI bb 4 0.9\n 10 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@filter
\u2014 Macro.
@filter(sql_query, conditions...)\n
Filter rows in a SQL table based on specified conditions.
Arguments
sql_query
: The SQL query to filter rows from.
conditions
: Expressions specifying the conditions that rows must satisfy to be included in the output. Rows for which the expression evaluates to true
will be included in the result. Multiple conditions can be combined using logical operators (&&
, ||
). It will automatically detect whether the conditions belong in WHERE vs HAVING.
Temporarily, it is best to use begin and end when filtering multiple conditions. (ex 2 below)\n
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @filter(percent > .5)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AF aa 1 0.6\n 2 \u2502 AG bb 2 0.7\n 3 \u2502 AH aa 3 0.8\n 4 \u2502 AI bb 4 0.9\n 5 \u2502 AJ aa 5 1.0\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(mean = mean(percent))\n @filter begin \n groups == \"bb\" || # logical operators can still be used like this\n mean > .5\n end\n @arrange(groups)\n @collect\n end\n2\u00d72 DataFrame\n Row \u2502 groups mean \n \u2502 String Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 0.6\n 2 \u2502 bb 0.5\n
source
# TidierDB.@full_join
\u2014 Macro.
@full_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform a full join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable for metadata storage that the join columns have different names.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or stringsnew_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or stringsExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @full_join((@chain db_table(db, \"df_join\") @filter(score > 70)), id)\n #@aside @show_query _\n @collect\n end\n11\u00d77 DataFrame\n Row \u2502 id groups value percent id_1 category score \n \u2502 String? String? Int64? Float64? String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 AB aa 2 0.2 missing missing missing \n 7 \u2502 AD aa 4 0.4 missing missing missing \n 8 \u2502 AF aa 1 0.6 missing missing missing \n 9 \u2502 AH aa 3 0.8 missing missing missing \n 10 \u2502 AJ aa 5 1.0 missing missing missing \n 11 \u2502 missing missing missing missing AM X 74\n
source
# TidierDB.@group_by
\u2014 Macro.
@group_by(sql_query, columns...)\n
Group SQL table rows by specified column(s). If grouping is performed as a terminal operation without a subsequent mutation or summarization (as in the example below), then the resulting data frame will be ungrouped when @collect
is applied.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions specifying the columns to group by. Columns can be specified by name.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @arrange(groups)\n @collect\n end\n2\u00d71 DataFrame\n Row \u2502 groups \n \u2502 String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa\n 2 \u2502 bb\n
source
# TidierDB.@head
\u2014 Macro.
@head(sql_query, value)\n
Limit the number of rows returned from a SQL table to the specified value. Equivalent to LIMIT
in SQL
Arguments
sql_query
: The SQL query to operate on.value
: Number to limit how many rows are returned. If left empty, it will default to 6 rowsExamples
julia> db = connect(duckdb());\n\njulia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> copy_to(db, df, \"df_mem\"); \n\njulia> @chain db_table(db, :df_mem) begin\n @head(1) ## supports expressions ie `3-2` would return the same df below\n @collect\n end\n1\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n
source
# TidierDB.@inner_join
\u2014 Macro.
@inner_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform an inner join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable for metadata storage that the join columns have different names.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or stringsnew_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or stringsExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @inner_join(\"df_join\", \"id\" = id2)\n @collect\n end\n5\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String String Int64 Float64 String String Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n
source
# TidierDB.@left_join
\u2014 Macro.
@left_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform a left join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable for metadata storage that the join columns have different names.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or stringsnew_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or stringsExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, \"df_mem\") begin\n @left_join(\"df_join\", \"id\" = \"id2\" )\n @collect\n end\n10\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String String Int64 Float64 String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 AB aa 2 0.2 missing missing missing \n 7 \u2502 AD aa 4 0.4 missing missing missing \n 8 \u2502 AF aa 1 0.6 missing missing missing \n 9 \u2502 AH aa 3 0.8 missing missing missing \n 10 \u2502 AJ aa 5 1.0 missing missing missing \n\njulia> query = @chain db_table(db, \"df_join\") begin\n @filter(score > 85) # only show scores above 85 in joining table\n end;\n\njulia> @chain db_table(db, \"df_mem\") begin\n @left_join(t(query), id = id2)\n @collect\n end\n10\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String String Int64 Float64 String? String? Int64? 
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AI bb 4 0.9 AI X 95\n 4 \u2502 AB aa 2 0.2 missing missing missing \n 5 \u2502 AD aa 4 0.4 missing missing missing \n 6 \u2502 AE bb 5 0.5 missing missing missing \n 7 \u2502 AF aa 1 0.6 missing missing missing \n 8 \u2502 AG bb 2 0.7 missing missing missing \n 9 \u2502 AH aa 3 0.8 missing missing missing \n 10 \u2502 AJ aa 5 1.0 missing missing missing \n
source
# TidierDB.@mutate
\u2014 Macro.
@mutate(sql_query, exprs...)\n
Mutate SQL table rows by adding new columns or modifying existing ones.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions for mutating the table. New columns can be added or existing columns modified using column_name = expression syntax, where expression can involve existing columns.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @mutate(value = value * 4, new_col = percent^2)\n @collect\n end\n10\u00d75 DataFrame\n Row \u2502 id groups value percent new_col \n \u2502 String String Int64 Float64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 4 0.1 0.01\n 2 \u2502 AB aa 8 0.2 0.04\n 3 \u2502 AC bb 12 0.3 0.09\n 4 \u2502 AD aa 16 0.4 0.16\n 5 \u2502 AE bb 20 0.5 0.25\n 6 \u2502 AF aa 4 0.6 0.36\n 7 \u2502 AG bb 8 0.7 0.49\n 8 \u2502 AH aa 12 0.8 0.64\n 9 \u2502 AI bb 16 0.9 0.81\n 10 \u2502 AJ aa 20 1.0 1.0\n
source
# TidierDB.@rename
\u2014 Macro.
@rename(sql_query, renamings...)\n
Rename one or more columns in a SQL query.
Arguments
sql_query
: The SQL query to operate on. renamings
: One or more pairs of old and new column names, specified as new_name = old_name
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @rename(new_name = percent)\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value new_name \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AB aa 2 0.2\n 3 \u2502 AC bb 3 0.3\n 4 \u2502 AD aa 4 0.4\n 5 \u2502 AE bb 5 0.5\n 6 \u2502 AF aa 1 0.6\n 7 \u2502 AG bb 2 0.7\n 8 \u2502 AH aa 3 0.8\n 9 \u2502 AI bb 4 0.9\n 10 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@right_join
\u2014 Macro.
@right_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform a right join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable for metadata storage that the join columns have different names.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or stringsnew_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or stringsExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @right_join(\"df_join\", id = id2)\n @collect\n end\n7\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? String String Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 missing missing missing missing AK Y 68\n 7 \u2502 missing missing missing missing AM X 74\n\njulia> query = @chain db_table(db, \"df_join\") begin\n @filter(score >= 74) # only show scores above 85 in joining table\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @right_join(t(query), id = id2)\n @collect\n end\n6\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? 
String String Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 missing missing missing missing AM X 74\n
source
# TidierDB.@select
\u2014 Macro.
@select(sql_query, columns)\n
Select specified columns from a SQL table.
Arguments
sql_query
: The SQL query to select columns from.columns
: Expressions specifying the columns to select. Columns can be specified by name, and new columns can be created with expressions using existing column values.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> df_mem = db_table(db, :df_mem);\n\njulia> @chain t(df_mem) begin\n @select(groups:percent)\n @collect\n end\n10\u00d73 DataFrame\n Row \u2502 groups value percent \n \u2502 String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 bb 1 0.1\n 2 \u2502 aa 2 0.2\n 3 \u2502 bb 3 0.3\n 4 \u2502 aa 4 0.4\n 5 \u2502 bb 5 0.5\n 6 \u2502 aa 1 0.6\n 7 \u2502 bb 2 0.7\n 8 \u2502 aa 3 0.8\n 9 \u2502 bb 4 0.9\n 10 \u2502 aa 5 1.0\n\njulia> @chain t(df_mem) begin\n @select(contains(\"e\"))\n @collect\n end\n10\u00d72 DataFrame\n Row \u2502 value percent \n \u2502 Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 0.1\n 2 \u2502 2 0.2\n 3 \u2502 3 0.3\n 4 \u2502 4 0.4\n 5 \u2502 5 0.5\n 6 \u2502 1 0.6\n 7 \u2502 2 0.7\n 8 \u2502 3 0.8\n 9 \u2502 4 0.9\n 10 \u2502 5 1.0\n
source
# TidierDB.@semi_join
\u2014 Macro.
@semi_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform a semi join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable for metadata storage that the join columns have different names.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or stringsnew_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or stringsExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @semi_join(\"df_join\", id = id2)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AC bb 3 0.3\n 3 \u2502 AE bb 5 0.5\n 4 \u2502 AG bb 2 0.7\n 5 \u2502 AI bb 4 0.9\n
source
# TidierDB.@slice_max
\u2014 Macro.
@slice_max(sql_query, column, n = 1)\n
Select rows with the largest values in specified column. This will always return ties.
Arguments
sql_query
: The SQL query to operate on.column
: Column to identify the largest values.n
: The number of rows to select with the largest values for each specified column. Default is 1, which selects the row with the largest value.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_max(value, n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_max(value)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 id groups value percent rank_col \n \u2502 String String Int64 Float64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AE bb 5 0.5 1\n 2 \u2502 AJ aa 5 1.0 1\n
source
# TidierDB.@slice_min
\u2014 Macro.
@slice_min(sql_query, column, n = 1)\n
Select rows with the smallest values in specified column. This will always return ties.
Arguments
sql_query
: The SQL query to operate on.column
: Column to identify the smallest values.n
: The number of rows to select with the smallest values for each specified column. Default is 1, which selects the row with the smallest value.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_min(value, n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_min(value)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 id groups value percent rank_col \n \u2502 String String Int64 Float64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 1\n 2 \u2502 AF aa 1 0.6 1\n
source
# TidierDB.@slice_sample
\u2014 Macro.
@slice_sample(sql_query, n)\n
Randomly select a specified number of rows from a SQL table.
Arguments
sql_query
: The SQL query to operate on.n
: The number of rows to randomly select.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_sample(n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_sample()\n @collect\n end;\n
source
# TidierDB.@summarise
\u2014 Macro.
@summarise(sql_query, exprs...)\n
Aggregate and summarize specified columns of a SQL table.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions defining the aggregation and summarization operations. These can specify simple aggregations like mean, sum, and count, or more complex expressions involving existing column values.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(across((value:percent), (mean, sum)))\n @arrange(groups)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 groups value_mean percent_mean value_sum percent_sum \n \u2502 String Float64 Float64 Int128 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 0.6 15 3.0\n 2 \u2502 bb 3.0 0.5 15 2.5\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(test = sum(percent), n = n())\n @arrange(groups)\n @collect\n end\n2\u00d73 DataFrame\n Row \u2502 groups test n \n \u2502 String Float64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 5\n 2 \u2502 bb 2.5 5\n
source
# TidierDB.@summarize
\u2014 Macro.
@summarize(sql_query, exprs...)\n
Aggregate and summarize specified columns of a SQL table.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions defining the aggregation and summarization operations. These can specify simple aggregations like mean, sum, and count, or more complex expressions involving existing column values.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(across((ends_with(\"e\"), starts_with(\"p\")), (mean, sum)))\n @arrange(groups)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 groups value_mean percent_mean value_sum percent_sum \n \u2502 String Float64 Float64 Int128 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 0.6 15 3.0\n 2 \u2502 bb 3.0 0.5 15 2.5\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(test = sum(percent), n = n())\n @arrange(groups)\n @collect\n end\n2\u00d73 DataFrame\n Row \u2502 groups test n \n \u2502 String Float64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 5\n 2 \u2502 bb 2.5 5\n
source
# TidierDB.@union
\u2014 Macro.
@union(sql_query1, sql_query2)\n
Combine two SQL queries using the UNION
operator.
Arguments
sql_query1
: The first SQL query to combine.sql_query2
: The second SQL query to combine.Returns
Examples
julia> db = connect(duckdb());\n\njulia> df1 = DataFrame(id = [1, 2, 3], value = [10, 20, 30]);\n\njulia> df2 = DataFrame(id = [4, 5, 6], value = [40, 50, 60]);\n\njulia> copy_to(db, df1, \"df1\");\n\njulia> copy_to(db, df2, \"df2\");\n\njulia> df1_table = db_table(db, \"df1\");\n\njulia> df2_table = db_table(db, \"df2\");\n\njulia> @chain t(df1_table) @union(df2_table) @collect\n6\u00d72 DataFrame\n Row \u2502 id value \n \u2502 Int64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 10\n 2 \u2502 2 20\n 3 \u2502 3 30\n 4 \u2502 4 40\n 5 \u2502 5 50\n 6 \u2502 6 60\n\njulia> query = @chain t(df2_table) @filter(value == 50);\n\njulia> @chain t(df1_table) begin \n @union(t(query))\n @collect\n end\n4\u00d72 DataFrame\n Row \u2502 id value \n \u2502 Int64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 10\n 2 \u2502 2 20\n 3 \u2502 3 30\n 4 \u2502 5 50\n\njulia> @chain t(df1_table) begin \n @union(t(df1_table))\n @collect\n end\n3\u00d72 DataFrame\n Row \u2502 id value \n \u2502 Int64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 10\n 2 \u2502 2 20\n 3 \u2502 3 30\n
source
# TidierDB.@window_frame
\u2014 Macro.
@window_frame(sql_query, args...)\n
Define the window frame for window functions in a SQL query, specifying the range of rows to include in the calculation relative to the current row.
Arguments
sqlquery::SQLQuery
: The SQLQuery instance to which the window frame will be applied.args...
: A variable number of arguments specifying the frame boundaries. These can be:
from
: The starting point of the frame. Can be a positive or negative integer, 0 or empty. When empty, it will use UNBOUNDEDto
: The ending point of the frame. Can be a positive or negative integer, 0 or empty. When empty, it will use UNBOUNDEDto
or from
it will default to from, and to will be UNBOUNDED.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> df_mem = db_table(db, :df_mem);\n\njulia> @chain t(df_mem) begin\n @group_by groups\n @window_frame(3)\n @mutate(avg = mean(percent))\n #@show_query\n end;\n\njulia> @chain t(df_mem) begin\n @group_by groups\n @window_frame(-3, 3)\n @mutate(avg = mean(percent))\n #@show_query\n end;\n\njulia> @chain t(df_mem) begin\n @group_by groups\n # @window_frame(to = -3)\n @mutate(avg = mean(percent))\n #@show_query\n @collect\n end;\n\njulia> @chain t(df_mem) begin\n @group_by groups\n @window_frame()\n @mutate(avg = mean(percent))\n #@show_query\n end;\n
source
# TidierDB.@window_order
\u2014 Macro.
@window_order(sql_query, columns...)\n
Specify the order of rows for window functions within a SQL query.
Arguments
sql_query
: The SQL query to operate on.columns
: Columns to order the rows by for the window function. Can include multiple columns for nested sorting. Prepend a column name with - for descending order.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by groups\n @window_frame(3)\n @window_order(desc(percent))\n @mutate(avg = mean(value))\n #@show_query \n end;\n
source
"},{"location":"reference/#reference-internal-functions","title":"Reference - Internal functions","text":"# TidierDB.@union_all
\u2014 Macro.
@union_all(sql_query1, sql_query2)\n
Combine two SQL queries using the UNION ALL
operator.
Arguments
sql_query1
: The first SQL query to combine.sql_query2
: The second SQL query to combine.Returns
Examples
julia> db = connect(duckdb());\n\njulia> df1 = DataFrame(id = [1, 2, 3], value = [10, 20, 30]);\n\njulia> copy_to(db, df1, \"df1\");\n\njulia> df1_table = db_table(db, \"df1\");\n\njulia> @chain t(df1_table) @union_all(df1_table) @collect\n6\u00d72 DataFrame\n Row \u2502 id value \n \u2502 Int64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 10\n 2 \u2502 2 20\n 3 \u2502 3 30\n 4 \u2502 1 10\n 5 \u2502 2 20\n 6 \u2502 3 30\n
source
"},{"location":"examples/generated/UserGuide/Snowflake/","title":"Using Snowflake","text":"Establishing a connection with the Snowflake SQL Rest API requires an OAuth token specific to the role the user will use to query tables.
"},{"location":"examples/generated/UserGuide/Snowflake/#connecting","title":"Connecting","text":"Connection is established with the connect
function as shown below. Connection requires 5 items as strings
Two things to note:
Since each time db_table
runs, it runs a query to pull the metadata, you may choose to run db_table
once, save the results, and use these results with from_query()
@show_query
even if the OAuthtoken has expired. To @collect
you will have to reconnect and rerun dbtable if your OAuth token has expiredset_sql_mode(snowflake())\nac_id = \"string_id\"\ntoken = \"OAuth_token_string\"\ncon = connect(:snowflake, ac_id, token, \"DEMODB\", \"PUBLIC\", \"COMPUTE_WH\")\n# After connection is established, a you may begin querying.\nstable_table_metadata = db_table(con, \"MTCARS\")\n@chain from_query(stable_table_metadata) begin\n @select(WT)\n @mutate(TEST = WT *2)\n #@aside @show_query _\n @collect\nend\n
32\u00d72 DataFrame\n Row \u2502 WT TEST\n \u2502 Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 2.62 5.24\n 2 \u2502 2.875 5.75\n 3 \u2502 2.32 4.64\n 4 \u2502 3.215 6.43\n \u22ee \u2502 \u22ee \u22ee\n 29 \u2502 3.17 6.34\n 30 \u2502 2.77 5.54\n 31 \u2502 3.57 7.14\n 32 \u2502 2.78 5.56\n 24 rows omitted\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/athena/","title":"Using Athena","text":"This page covers the setup and a small syntax difference needed to use the AWS Athena backend with TidierDB.
"},{"location":"examples/generated/UserGuide/athena/#connecting","title":"Connecting","text":"Connection is established through AWS.jl as shown below.
using TidierDB, AWS\nset_sql_mode(athena())\n# Replace your credentials as needed below\naws_access_key_id = get(ENV,\"AWS_ACCESS_KEY_ID\",\"key\")\naws_secret_access_key = get(ENV, \"AWS_SECRET_ACCESS_KEY\",\"secret_key\")\naws_region = get(ENV,\"AWS_DEFAULT_REGION\",\"region\")\n\nconst AWS_GLOBAL_CONFIG = Ref{AWS.AWSConfig}()\ncreds = AWSCredentials(aws_access_key_id, aws_secret_access_key)\n\nAWS_GLOBAL_CONFIG[] = AWS.global_aws_config(region=aws_region, creds=creds)\n\ncatalog = \"AwsDataCatalog\"\nworkgroup = \"primary\"\ndb = \"demodb\"\nall_results = true\nresults_per_increment = 10\nout_loc = \"s3://location/\"\n\nathena_params = Dict(\n \"ResultConfiguration\" => Dict(\n \"OutputLocation\" => out_loc\n ),\n \"QueryExecutionContext\" => Dict(\n \"Database\" => db,\n \"Catalog\" => catalog\n ),\n \"Workgroup\" => workgroup\n)\n
"},{"location":"examples/generated/UserGuide/athena/#db_table-differences","title":"db_table
differences","text":"There are two differences for db_table
which are seen in the query below
\"demodb.table_name
db_table
requires a third argument: the athena_params from above.from_query
with Athena to reduce number of queries","text":"Throughout TidierDB, each time db_table
is called, it queries the databases to get the metadata. Considering how AWS Athena logs queries, a user may want to reduce the number of queries. This can be done by saving the results of db_table
, and then using from_query with those results for further queries as shown below.
mtcars = db_table(AWS_GLOBAL_CONFIG[], \"demodb.mtcars\", athena_params)\n@chain from_query(mtcars) begin\n @filter(cyl > 4)\n @group_by(cyl)\n @summarize(mpg = mean(mpg))\n #@show_query\n @collect\nend\n
2\u00d72 DataFrame\n Row \u2502 cyl mpg\n \u2502 Int64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 6 19.7429\n 2 \u2502 8 15.1\n
I would like to acknowledge the work of Manu Francis and this blog post, which helped guide this process
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/databricks/","title":"Using Databricks","text":"Establishing a connection with the Databricks SQL Rest API requires a token.
"},{"location":"examples/generated/UserGuide/databricks/#connecting","title":"Connecting","text":"Connection is established with the connect
function as shown below. Connection requires 5 items as strings
One thing to note, Since each time db_table
runs, it runs a query to pull the metadata, you may choose to run db_table
and save the results, and use these results with from_query()
. This will reduce the number of queries to your database and is illustrated below.
set_sql_mode(databricks())\ninstance_id = \"string_id\"\ntoken \"string_token\"\nwarehouse_id = \"e673cd4f387f964a\"\ncon = connect(:databricks, instance_id, token, \"DEMODB\", \"PUBLIC\", warehouse_id)\n# After connection is established, a you may begin querying.\nstable_table_metadata = db_table(con, \"mtcars\")\n@chain from_query(stable_table_metadata) begin\n @select(wt)\n @mutate(test = wt *2)\n #@aside @show_query _\n @collect\nend\n
32\u00d72 DataFrame\n Row \u2502 wt test\n \u2502 Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 2.62 5.24\n 2 \u2502 2.875 5.75\n 3 \u2502 2.32 4.64\n 4 \u2502 3.215 6.43\n \u22ee \u2502 \u22ee \u22ee\n 29 \u2502 3.17 6.34\n 30 \u2502 2.77 5.54\n 31 \u2502 3.57 7.14\n 32 \u2502 2.78 5.56\n 24 rows omitted\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/duckplyr_reprex/","title":"Reproduce a duckplyr example","text":"In this example, we will reproduce a DuckDB and duckplyr blog post example to demonstrate TidierDB's v0.5.0 capability.
The example by Hannes that is being reproduced is exploring Open Data from the New Zealand government that is ~ 1GB.
"},{"location":"examples/generated/UserGuide/duckplyr_reprex/#set-up","title":"Set up","text":"First we will set up the local duckdb database and pull in the metadata for the files. Notice we are not reading this data into memory, only the paths and the column and table names. To follow along, copy the setup code below after downloading the data, but add the directory to the local data.
import TidierDB as DB\ndb = DB.connect(DB.duckdb())\n\ndir = \"/Downloads/nzcensus/\"\ndata = dir * \"Data8277.csv\"\nage = dir * \"DimenLookupAge8277.csv\"\narea = dir * \"DimenLookupArea8277.csv\"\nethnic = dir * \"DimenLookupEthnic8277.csv\"\nsex = dir * \"DimenLookupSex8277.csv\"\nyear = dir * \"DimenLookupYear8277.csv\"\n\ndata = DB.db_table(db, data);\nage = DB.db_table(db, age);\narea = DB.db_table(db, area);\nethnic = DB.db_table(db, ethnic);\nsex = DB.db_table(db, sex);\nyear = DB.db_table(db, year);\n
"},{"location":"examples/generated/UserGuide/duckplyr_reprex/#exploration","title":"Exploration","text":"While this long chain could be broken up into multiple smaller chains, lets reproduce the duckplyr code from example and demonstrate how TidierDB also supports multiple joins after filtering, mutating, etc the joining tables. 6 different tables are being joined together through sequential inner joins.
@chain DB.t(data) begin\n DB.@filter(str_detect(count, r\"^\\d+$\"))\n DB.@mutate(count_ = as_integer(count))\n DB.@filter(count_ > 0)\n DB.@inner_join(\n (@chain DB.t(age) begin\n DB.@filter(str_detect(Description, r\"^\\d+ years$\"))\n DB.@mutate(age_ = as_integer(str_remove(Code, \"years\"))) end),\n Age = Code\n )\n DB.@inner_join((@chain DB.t(year) DB.@mutate(year_ = Description)), year = Code)\n DB.@inner_join((@chain DB.t(area) begin\n DB.@mutate(area_ = Description)\n DB.@filter(!str_detect(area_, r\"^Total\"))\n end)\n , Area = Code)\n DB.@inner_join((@chain DB.t(ethnic) begin\n DB.@mutate(ethnic_ = Description)\n DB.@filter(!str_detect( ethnic_, r\"^Total\",)) end), Ethnic = Code)\n DB.@inner_join((@chain DB.t(sex) begin\n DB.@mutate(sex_ = Description)\n DB.@filter(!str_detect( sex_, r\"^Total\"))\n end)\n , Sex = Code)\n DB.@inner_join((@chain DB.t(year) DB.@mutate(year_ = Description)), Year = Code)\n @aside DB.@show_query _\n DB.@create_view(joined_up)\nend;\n\n@chain DB.db_table(db, \"joined_up\") begin\n DB.@filter begin\n age_ >= 20\n age_ <= 40\n str_detect(area_, r\"^Auckland\")\n year_ == \"2018\"\n ethnic_ != \"European\"\n end\n DB.@group_by sex_\n DB.@summarise(group_count = sum(count_))\n DB.@collect\nend\n
"},{"location":"examples/generated/UserGuide/duckplyr_reprex/#results","title":"Results","text":"When we collect this to a local dataframe, we can see that the results match the duckplyr/DuckDB example.
2\u00d72 DataFrame\n Row \u2502 sex_ group_count\n \u2502 String Int128\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Female 398556\n 2 \u2502 Male 397326\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/ex_joining/","title":"Joining Tables","text":"This page will illustrate how to join different tables in TidierDB. The examples will use the mtcars
dataset and a synthetic dataset called mt2
hosted on a personal MotherDuck instance. Examples will cover how to join tables with different schemas in different databases, how to write queries on tables and then join them together, and how to do this by leveraging views.
using TidierDB\ndb = connect(duckdb(), \"md:\")\n\nmtcars = db_table(db, \"my_db.mtcars\")\nmt2 = db_table(db, \"ducks_db.mt2\")\n
"},{"location":"examples/generated/UserGuide/ex_joining/#wrangle-tables-and-self-join","title":"Wrangle tables and self join","text":"query = @chain t(mtcars) begin\n @group_by cyl\n @summarize begin\n across(mpg, (mean, minimum, maximum))\n num_cars = n()\n end\n @mutate begin\n efficiency = case_when(\n mpg_mean >= 25, \"High\",\n mpg_mean >= 15, \"Moderate\",\n \"Low\" )\n end\nend;\n\nquery2 = @chain t(mtcars) @filter(mpg>20) @mutate(mpg = mpg *4);\n\n@chain t(query) begin\n @left_join(t(query2), cyl, cyl)\n @group_by(efficiency)\n @summarize(avg_mean = mean(mpg))\n @mutate(mean = avg_mean / 4 )\n @aside @show_query _\n @collect\nend\n
2\u00d73 DataFrame\n Row \u2502 efficiency avg_mean mean\n \u2502 String Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 High 106.655 26.6636\n 2 \u2502 Moderate 84.5333 21.1333\n
"},{"location":"examples/generated/UserGuide/ex_joining/#different-schemas","title":"Different schemas","text":"To connect to a table in a different schema, prefix it with a dot. For example, \"schemaname.tablename\". In this query, we are also filtering out cars that contain \"M\" in the name from the mt2
table before joining.
other_db = @chain db_table(db, \"ducks_db.mt2\") @filter(!str_detect(car, \"M\"))\n@chain t(mtcars) begin\n @left_join(t(other_db), car, model)\n @select(car, model)\n @head(5)\n @collect\nend\n
5\u00d72 DataFrame\n Row \u2502 car model\n \u2502 String String\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Datsun 710 Datsun 710\n 2 \u2502 Hornet 4 Drive Hornet 4 Drive\n 3 \u2502 Hornet Sportabout Hornet Sportabout\n 4 \u2502 Valiant Valiant\n 5 \u2502 Duster 360 Duster 360\n
To join directly to the table, you can use the @left_join
macro with the table name as a string.
@chain t(mtcars) begin\n @left_join(\"ducks_db.mt2\", car, model)\n @select(car, model)\n @head(5)\n @collect\nend\n
5\u00d72 DataFrame\n Row \u2502 car model\n \u2502 String String\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 Mazda RX4\n 2 \u2502 Mazda RX4 Wag Mazda RX4 Wag\n 3 \u2502 Datsun 710 Datsun 710\n 4 \u2502 Hornet 4 Drive Hornet 4 Drive\n 5 \u2502 Hornet Sportabout Hornet Sportabout\n
"},{"location":"examples/generated/UserGuide/ex_joining/#using-a-view","title":"Using a View","text":"You can also use @create_view
to create views and then join them. This is an alternate way to reuse complex queries.
# notice, this is not begin saved, bc a view is created in the database at the end of the chain\n@chain t(mtcars) begin\n @group_by cyl\n @summarize begin\n across(mpg, (mean, minimum, maximum))\n num_cars = n()\n end\n @mutate begin\n efficiency = case_when(\n mpg_mean >= 25, \"High\",\n mpg_mean >= 15, \"Moderate\",\n \"Low\" )\n end\n #create a view in the database\n @create_view(viewer)\nend;\n\n# access the view like as if it was any other table\n@chain db_table(db, \"viewer\") begin\n @left_join(t(query2), cyl, cyl)\n @group_by(efficiency)\n @summarize(avg_mean = mean(mpg))\n @mutate(mean = avg_mean / 4 )\n @collect\nend\n
2\u00d73 DataFrame\n Row \u2502 efficiency avg_mean mean\n \u2502 String Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 High 106.655 26.6636\n 2 \u2502 Moderate 84.5333 21.1333\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/from_queryex/","title":"Reusing a Query (and Views)","text":"While using TidierDB, you may need to generate part of a query and reuse it multiple times. There are two ways to do this
from_query(query)
or its alias t(query)
@create_view(name)
import TidierDB as DB\ncon = DB.connect(duckdb())\nmtcars_path = \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\"\nmtcars = DB.db_table(con, mtcars_path)\n
Start a query to analyze fuel efficiency by number of cylinders. However, to further build on this query later, end the chain without using @show_query
or @collect
query = DB.@chain DB.t(mtcars) begin\n DB.@group_by cyl\n DB.@summarize begin\n across(mpg, (mean, minimum, maximum))\n num_cars = n()\n end\n DB.@mutate begin\n efficiency = case_when(\n mpg_mean >= 25, \"High\",\n mpg_mean >= 15, \"Moderate\",\n \"Low\" )\n end\nend;\n
"},{"location":"examples/generated/UserGuide/from_queryex/#from_query-or-tquery","title":"from_query()
or t(query)
","text":"Now, from_query
, or t()
a convenience wrapper, will allow you to reuse the query to calculate the average horsepower for each efficiency category.
DB.@chain DB.t(query) begin\n DB.@left_join(DB.t(mtcars), cyl = cyl)\n DB.@group_by(efficiency)\n DB.@summarize(avg_hp = mean(hp))\n DB.@collect\nend\n
2\u00d72 DataFrame\n Row \u2502 efficiency avg_hp\n \u2502 String? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Moderate 180.238\n 2 \u2502 High 82.6364\n
"},{"location":"examples/generated/UserGuide/from_queryex/#create_view","title":"@create_view","text":"This can also be done with @create_view
.
query2 = @chain t(mtcars) @filter(mpg>20) @mutate(mpg = mpg *4);\nDB.@chain DB.db_table(db, \"mtcars\") begin\n DB.@group_by cyl\n DB.@summarize begin\n across(mpg, (mean, minimum, maximum))\n num_cars = n()\n end\n DB.@mutate begin\n efficiency = case_when(\n mpg_mean >= 25, \"High\",\n mpg_mean >= 15, \"Moderate\",\n \"Low\" )\n end\n DB.@create_view(viewer)\n end;\n\n\nDB.@chain DB.db_table(db, \"viewer\") begin\n DB.@left_join(DB.t(query2), cyl = cyl)\n DB.@group_by(efficiency)\n DB.@summarize(avg_mean = mean(mpg))\n DB.@mutate(mean = avg_mean / 4 )\n @aside DB.@show_query _\n DB.@collect\nend\n2\u00d73 DataFrame\n Row \u2502 efficiency avg_mean mean\n \u2502 String Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 High 106.655 26.6636\n 2 \u2502 Moderate 84.5333 21.1333\n
"},{"location":"examples/generated/UserGuide/from_queryex/#preview-or-save-an-intermediate-table","title":"Preview or save an intermediate table","text":"While querying a dataset, you may wish to see an intermediate table, or even save it. You can use @aside
and from_query(_)
, illustrated below, to do just that. While we opted to print the results in this simple example below, we could have saved them by using name = DB.@chain...
import ClickHouse;\nconn = conn = DB.connect(DB.clickhouse(); host=\"localhost\", port=19000, database=\"default\", user=\"default\", password=\"\")\npath = \"https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet\"\nDB.@chain DB.db_table(conn, path) begin\n DB.@count(cyl)\n @aside println(DB.@chain DB.from_query(_) DB.@head(5) DB.@collect)\n DB.@arrange(desc(count))\n DB.@collect\nend\n
5\u00d72 DataFrame\n Row \u2502 artists count\n \u2502 String? UInt64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 missing 1\n 2 \u2502 Wizo 3\n 3 \u2502 MAGIC! 3\n 4 \u2502 Macaco 1\n 5 \u2502 SOYOU 1\n31438\u00d72 DataFrame\n Row \u2502 artists count\n \u2502 String? UInt64\n\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 The Beatles 279\n 2 \u2502 George Jones 271\n 3 \u2502 Stevie Wonder 236\n 4 \u2502 Linkin Park 224\n 5 \u2502 Ella Fitzgerald 222\n 6 \u2502 Prateek Kuhad 217\n 7 \u2502 Feid 202\n \u22ee \u2502 \u22ee \u22ee\n 31432 \u2502 Leonard 1\n 31433 \u2502 marcos g 1\n 31434 \u2502 BLVKSHP 1\n 31435 \u2502 Memtrix 1\n 31436 \u2502 SOYOU 1\n 31437 \u2502 Macaco 1\n 31438 \u2502 missing 1\n 31424 rows omitted\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/functions_pass_to_DB/","title":"Writing Functions with TidierDB Chains","text":"On this page, we'll briefly explore how to use TidierDB macros and $
with @eval
to build a function.
For a more in-depth explanation, please check out the TidierData page on interpolation.
using TidierDB, DataFrames;\n\ndb = connect(duckdb());\ndf = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9],\n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10],\n value = repeat(1:5, 2),\n percent = 0.1:0.1:1.0);\ncopy_to(db, df, \"dfm\");\ndf_mem = db_table(db, \"dfm\");\n
"},{"location":"examples/generated/UserGuide/functions_pass_to_DB/#interpolation","title":"Interpolation","text":"Variables are interpoated using @eval
and $
. Place @eval
before you begin the chain or call a TidierDB macro. Why use @eval? In Julia, macros like @filter are expanded at parse time, before runtime variables like vals are available. By using @eval, we force the expression to be evaluated at runtime, allowing us to interpolate the variable into the macro.
num = [3];\ncolumn = :id;\n@eval @chain t(df_mem) begin\n @filter(value in $num)\n @select($column)\n @collect\n end\n
2\u00d71 DataFrame RowidString1AC2AH "},{"location":"examples/generated/UserGuide/functions_pass_to_DB/#function-set-up","title":"Function set up","text":"Begin by defining your function as you normally would, but before @chain
you need to use @eval
. Variables to be interpolated need to be prefixed with $
function test(vals, cols)\n @eval @chain t(df_mem) begin\n @filter(value in $vals)\n @select($cols)\n @collect\n end\nend;\n\nvals = [1, 2, 3, 3];\ntest(vals, [:groups, :value, :percent])\n
6\u00d73 DataFrame RowgroupsvaluepercentStringInt64Float641bb10.12aa10.63aa20.24bb20.75bb30.36aa30.8 Now with a new variable
other_vals = [1];\ncols = [:value, :percent];\ntest(other_vals, cols)\n
2\u00d72 DataFrame RowvaluepercentInt64Float64110.1210.6 Defining a new function
function gs(groups, aggs, new_name, threshold)\n @eval @chain t(df_mem) begin\n @group_by($groups)\n @summarize($new_name = mean($aggs))\n @filter($new_name > $threshold)\n @collect\n end\nend;\n\ngs(:groups, :percent, :mean_percent, .5)\n
1\u00d72 DataFrame Rowgroupsmean_percentStringFloat641aa0.6 Change the column and threshold
gs(:groups, :value, :mean_value, 2)\n
2\u00d72 DataFrame Rowgroupsmean_valueStringFloat641bb3.02aa3.0 "},{"location":"examples/generated/UserGuide/functions_pass_to_DB/#write-pipeline-function-to-use-inside-of-chains","title":"Write pipeline function to use inside of chains","text":"Let's say there is a particular sequence of macros that you want to use repeatedly. Wrap this series into a function that accepts a t(query)
as its first argument and returns a SQLquery
and you can easily reuse it.
function moving_aggs(table, start, stop, group, order, col)\n qry = @eval @chain $table begin\n @group_by $group\n @window_frame $start $stop\n @window_order $order\n @mutate(across($col, (minimum, maximum, mean)))\n end\n return qry\nend;\n\n@chain t(df_mem) begin\n moving_aggs(-2, 1, :groups, :percent, :value)\n @filter value_mean > 2.75\n @aside @show_query _\n @collect\nend\n
6\u00d77 DataFrame Rowidgroupsvaluepercentvalue_minimumvalue_maximumvalue_meanStringStringInt64Float64Int64Int64Float641ABaa20.2243.02AHaa30.8153.253AJaa51.0153.04ACbb30.3153.05AGbb20.7253.56AIbb40.9253.66667 Filtering before the window functions
@chain t(df_mem) begin\n @filter(value >=2 )\n moving_aggs(-1, 1, :groups, :percent, :value)\n @aside @show_query _\n @collect\nend\n
8\u00d77 DataFrame Rowidgroupsvaluepercentvalue_minimumvalue_maximumvalue_meanStringStringInt64Float64Int64Int64Float641ABaa20.2243.02ADaa40.4243.03AHaa30.8354.04AJaa51.0354.05ACbb30.3354.06AEbb50.5253.333337AGbb20.7253.666678AIbb40.9243.0 "},{"location":"examples/generated/UserGuide/functions_pass_to_DB/#interpolating-queries","title":"Interpolating Queries","text":"To use a prior, uncollected TidierDB query in other TidierDB macros, interpolate the needed query without showing or collecting it
ok = @chain t(df_mem) @summarize(mean = mean(value));\n
The mean value represented in SQL from the above is 3
With @filter
@eval @chain t(df_mem) begin\n @filter( value > $ok)\n @collect\nend\n
4\u00d74 DataFrame RowidgroupsvaluepercentStringStringInt64Float641ADaa40.42AEbb50.53AIbb40.94AJaa51.0 With @mutate
@eval @chain t(df_mem) begin\n @mutate(value2 = value + $ok)\n @collect\nend\n
10\u00d75 DataFrame Rowidgroupsvaluepercentvalue2StringStringInt64Float64Float641AAbb10.14.02ABaa20.25.03ACbb30.36.04ADaa40.47.05AEbb50.58.06AFaa10.64.07AGbb20.75.08AHaa30.86.09AIbb40.97.010AJaa51.08.0 With @summarize
@eval @chain t(df_mem) begin\n @summarize(value = mean(value) * $ok)\n @collect\nend\n
1\u00d71 DataFrame RowvalueFloat6419.0 This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/getting_started/","title":"Getting Started","text":"To use TidierDB.jl, you will have to set up a connection. TidierDB.jl gives you access to duckdb via duckdb_open
and duckdb_connect
. However, to use MySql, ClickHouse, MSSQL, Postgres, or SQLite, you will have to load those packages in first.
If you plan to use TidierDB.jl with TidierData.jl or Tidier.jl, it is most convenient to load the packages as follows:
using TidierData\nimport TidierDB as DB\n
Alternatively, using Tidier
will import TidierDB in the above manner for you, where TidierDB functions and macros will be available as DB.@mutate()
and so on, and the TidierData equivalent would be @mutate()
.
To connect to a database, you can use the connect
function as shown below, or establish your own connection through the respective libraries.
For example Connecting to MySQL
conn = DB.connect(DB.mysql(); host=\"localhost\", user=\"root\", password=\"password\", db=\"mydb\")\n
versus connecting to DuckDB
conn = DB.connect(DB.duckdb())\n
"},{"location":"examples/generated/UserGuide/getting_started/#connect-to-a-local-database-file","title":"Connect to a local database file","text":"You can also connect to an existing database by passing the database file path as a string.
db = DB.connect(DB.duckdb(), \"mydb.duckdb\")\n
You can also establish any DuckDB connection through an alternate method that you prefer, and use that as your connection as well.
"},{"location":"examples/generated/UserGuide/getting_started/#package-extensions","title":"Package Extensions","text":"The following backends utilize package extensions. To use one of the backends listed below, you will need to write using Library
import ClickHouse
using MySQL
using ODBC
using LibPQ
using SQLite
using AWS
using ODBC
using GoogleCloud
db_table
","text":"What does db_table
do?
db_table
starts the underlying SQL query struct, in addition to pulling the table metadata and storing it there. Storing metadata is what enables a lazy interface that also supports tidy selection.
db_table
has two required arguments: connection
and table
table
can be a table name on a database or a path/url to a file to read. When passing db_table
a path or url, the table is not copied into memory.
db_table
only supports direct file paths to a table. It does not support database file paths such as dbname.duckdb
or dbname.sqlite
. Such files must be used with connect
first.*
read in all files matching the pattern..csv
in the given folder.db_table(db, \"folder/path/*.csv\")\n
db_table
also supports iceberg, delta, and S3 file paths via DuckDB.
If you are working with a backend where compute cost is important, it will be important to minimize using db_table
as this will requery for metadata each time. Compute costs are relevant to backends such as AWS, databricks and Snowflake.
To do this, save the results of db_table
and use them with t
. Using t
pulls the relevant information (metadata, con, etc.) from the mutable SQLquery struct, allowing you to repeatedly query and collect the table without requerying for the metadata each time.
!Tip: t()
is an alias for from_query
This means after saving the results of db_table
, use t(table)
to refer to the table or prior query
table = DB.db_table(con, \"path\")\n@chain DB.t(table) begin\n ## data wrangling here\nend\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/ibis_comp/","title":"TidierDB.jl vs Ibis","text":""},{"location":"examples/generated/UserGuide/ibis_comp/#comparing-tidierdb-vs-ibis","title":"Comparing TidierDB vs Ibis","text":"TidierDB is a reimplementation of dbplyr from R, so the syntax is remarkably similar. But how does TidierDB compare to Python's Ibis? This page will perform a similar comparison to the Ibis Documentation comparing Ibis and dplyr
"},{"location":"examples/generated/UserGuide/ibis_comp/#set-up","title":"Set up","text":"Ibis
import ibis\nimport ibis.selectors as s # allows for different styles of column selection\nfrom ibis import _ # eliminates need to type table name before each column vs typing cols as strings\nibis.options.interactive = True # automatically collects first 10 rows of table\n\ncon = ibis.connect(\"duckdb://\")\n
TidierDB
using TidierDB\ndb = connect(duckdb())\n
Of note, TidierDB does not yet have an \"interactive mode\" so each example result will be collected.
"},{"location":"examples/generated/UserGuide/ibis_comp/#loading-data","title":"Loading Data","text":"With Ibis, there are specific functions to read in different file types
mtcars = con.read_csv(\"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\")\n
In TidierDB, there is only db_table
, which determines the file type and generates the syntax appropriate for the backend in use.
mtcars = db_table(db, \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\");\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#previewing-the-data","title":"Previewing the data","text":"TidierDB and Ibis use head
/@head
to preview the first rows of a dataset.
Ibis
mtcars.head(6)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 model \u2503 mpg \u2503 cyl \u2503 disp \u2503 hp \u2503 drat \u2503 wt \u2503 qsec \u2503 vs \u2503 am \u2503 gear \u2503 carb \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502 float64 \u2502 int64 \u2502 float64 \u2502 int64 \u2502 float64 \u2502 float64 \u2502 float64 \u2502 int64 \u2502 int64 \u2502 int64 \u2502 int64 
\u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Mazda RX4 \u2502 21.0 \u2502 6 \u2502 160.0 \u2502 110 \u2502 3.90 \u2502 2.620 \u2502 16.46 \u2502 0 \u2502 1 \u2502 4 \u2502 4 \u2502\n\u2502 Mazda RX4 Wag \u2502 21.0 \u2502 6 \u2502 160.0 \u2502 110 \u2502 3.90 \u2502 2.875 \u2502 17.02 \u2502 0 \u2502 1 \u2502 4 \u2502 4 \u2502\n\u2502 Datsun 710 \u2502 22.8 \u2502 4 \u2502 108.0 \u2502 93 \u2502 3.85 \u2502 2.320 \u2502 18.61 \u2502 1 \u2502 1 \u2502 4 \u2502 1 \u2502\n\u2502 Hornet 4 Drive \u2502 21.4 \u2502 6 \u2502 258.0 \u2502 110 \u2502 3.08 \u2502 3.215 \u2502 19.44 \u2502 1 \u2502 0 \u2502 3 \u2502 1 \u2502\n\u2502 Hornet Sportabout \u2502 18.7 \u2502 8 \u2502 360.0 \u2502 175 \u2502 3.15 \u2502 3.440 \u2502 17.02 \u2502 0 \u2502 0 \u2502 3 \u2502 2 \u2502\n\u2502 Valiant \u2502 18.1 \u2502 6 \u2502 225.0 \u2502 105 \u2502 2.76 \u2502 3.460 \u2502 20.22 \u2502 1 \u2502 0 \u2502 3 \u2502 1 
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
TidierDB
@chain t(mtcars) @head(6) @collect\n
6\u00d712 DataFrame\n Row \u2502 model mpg cyl disp hp drat wt qsec vs am gear carb\n \u2502 String? Float64? Int64? Float64? Int64? Float64? Float64? Float64? Int64? Int64? Int64? Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4\n 2 \u2502 Mazda RX4 Wag 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4\n 3 \u2502 Datsun 710 22.8 4 108.0 93 3.85 2.32 18.61 1 1 4 1\n 4 \u2502 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n 5 \u2502 Hornet Sportabout 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2\n 6 \u2502 Valiant 18.1 6 225.0 105 2.76 3.46 20.22 1 0 3 1\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#filtering","title":"Filtering","text":"The example below demonstrates how to filter using multiple criteria in both Ibis and TidierData Ibis
mtcars.filter(((_.mpg > 22) & (_.drat > 4) | (_.hp == 113)))\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 model \u2503 mpg \u2503 cyl \u2503 disp \u2503 hp \u2503 drat \u2503 wt \u2503 qsec \u2503 vs \u2503 am \u2503 gear \u2503 carb \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502 float64 \u2502 int64 \u2502 float64 \u2502 int64 \u2502 float64 \u2502 float64 \u2502 float64 \u2502 int64 \u2502 int64 \u2502 int64 \u2502 int64 
\u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Lotus Europa \u2502 30.4 \u2502 4 \u2502 95.1 \u2502 113 \u2502 3.77 \u2502 1.513 \u2502 16.90 \u2502 1 \u2502 1 \u2502 5 \u2502 2 \u2502\n\u2502 Fiat 128 \u2502 32.4 \u2502 4 \u2502 78.7 \u2502 66 \u2502 4.08 \u2502 2.200 \u2502 19.47 \u2502 1 \u2502 1 \u2502 4 \u2502 1 \u2502\n\u2502 Honda Civic \u2502 30.4 \u2502 4 \u2502 75.7 \u2502 52 \u2502 4.93 \u2502 1.615 \u2502 18.52 \u2502 1 \u2502 1 \u2502 4 \u2502 2 \u2502\n\u2502 Toyota Corolla \u2502 33.9 \u2502 4 \u2502 71.1 \u2502 65 \u2502 4.22 \u2502 1.835 \u2502 19.90 \u2502 1 \u2502 1 \u2502 4 \u2502 1 \u2502\n\u2502 Fiat X1-9 \u2502 27.3 \u2502 4 \u2502 79.0 \u2502 66 \u2502 4.08 \u2502 1.935 \u2502 18.90 \u2502 1 \u2502 1 \u2502 4 \u2502 1 \u2502\n\u2502 Porsche 914-2 \u2502 26.0 \u2502 4 \u2502 120.3 \u2502 91 \u2502 4.43 \u2502 2.140 \u2502 16.70 \u2502 0 \u2502 1 \u2502 5 \u2502 2 
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
TidierDB
@chain t(mtcars) begin\n @filter((mpg > 22 && drat > 4) || hp == 113)\n @collect\nend\n
6\u00d712 DataFrame\n Row \u2502 model mpg cyl disp hp drat wt qsec vs am gear carb\n \u2502 String? Float64? Int64? Float64? Int64? Float64? Float64? Float64? Int64? Int64? Int64? Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2\n 2 \u2502 Fiat 128 32.4 4 78.7 66 4.08 2.2 19.47 1 1 4 1\n 3 \u2502 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n 4 \u2502 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1\n 5 \u2502 Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.9 1 1 4 1\n 6 \u2502 Porsche 914-2 26.0 4 120.3 91 4.43 2.14 16.7 0 1 5 2\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#creating-new-columns","title":"Creating new columns","text":"Both TidierDB and Ibis use mutate
/@mutate
to add new columns
Ibis
(\n mtcars\n .mutate(kpg = _.mpg * 1.61)\n .select(\"model\", \"kpg\")\n)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 model \u2503 kpg \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502 float64 \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Mazda RX4 \u2502 33.810 \u2502\n\u2502 Mazda RX4 Wag \u2502 33.810 \u2502\n\u2502 Datsun 710 \u2502 36.708 \u2502\n\u2502 Hornet 4 Drive \u2502 34.454 \u2502\n\u2502 Hornet Sportabout \u2502 30.107 \u2502\n\u2502 Valiant \u2502 29.141 \u2502\n\u2502 Duster 360 \u2502 23.023 \u2502\n\u2502 Merc 240D \u2502 39.284 \u2502\n\u2502 Merc 230 \u2502 36.708 \u2502\n\u2502 Merc 280 \u2502 30.912 \u2502\n\u2502 \u2026 \u2502 \u2026 \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
TidierDB
@chain t(mtcars) begin\n @mutate(kpg = mpg * 1.61)\n @select(model, kpg)\n @collect\nend\n
32\u00d72 DataFrame\n Row \u2502 model kpg\n \u2502 String? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 33.81\n 2 \u2502 Mazda RX4 Wag 33.81\n 3 \u2502 Datsun 710 36.708\n 4 \u2502 Hornet 4 Drive 34.454\n 5 \u2502 Hornet Sportabout 30.107\n 6 \u2502 Valiant 29.141\n \u22ee \u2502 \u22ee \u22ee\n 27 \u2502 Porsche 914-2 41.86\n 28 \u2502 Lotus Europa 48.944\n 29 \u2502 Ford Pantera L 25.438\n 30 \u2502 Ferrari Dino 31.717\n 31 \u2502 Maserati Bora 24.15\n 32 \u2502 Volvo 142E 34.454\n 20 rows omitted\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#sorting-columns","title":"Sorting columns","text":"Ibis uses order_by
similar to SQL's ORDER BY
Ibis
mtcars.order_by(_.mpg)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 model \u2503 mpg \u2503 cyl \u2503 disp \u2503 hp \u2503 drat \u2503 wt \u2503 qsec \u2503 vs \u2503 am \u2503 gear \u2503 carb \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502 float64 \u2502 int64 \u2502 float64 \u2502 int64 \u2502 float64 \u2502 float64 \u2502 float64 \u2502 int64 \u2502 int64 \u2502 int64 \u2502 int64 
\u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Cadillac Fleetwood \u2502 10.4 \u2502 8 \u2502 472.0 \u2502 205 \u2502 2.93 \u2502 5.250 \u2502 17.98 \u2502 0 \u2502 0 \u2502 3 \u2502 4 \u2502\n\u2502 Lincoln Continental \u2502 10.4 \u2502 8 \u2502 460.0 \u2502 215 \u2502 3.00 \u2502 5.424 \u2502 17.82 \u2502 0 \u2502 0 \u2502 3 \u2502 4 \u2502\n\u2502 Camaro Z28 \u2502 13.3 \u2502 8 \u2502 350.0 \u2502 245 \u2502 3.73 \u2502 3.840 \u2502 15.41 \u2502 0 \u2502 0 \u2502 3 \u2502 4 \u2502\n\u2502 Duster 360 \u2502 14.3 \u2502 8 \u2502 360.0 \u2502 245 \u2502 3.21 \u2502 3.570 \u2502 15.84 \u2502 0 \u2502 0 \u2502 3 \u2502 4 \u2502\n\u2502 Chrysler Imperial \u2502 14.7 \u2502 8 \u2502 440.0 \u2502 230 \u2502 3.23 \u2502 5.345 \u2502 17.42 \u2502 0 \u2502 0 \u2502 3 \u2502 4 \u2502\n\u2502 Maserati Bora \u2502 15.0 \u2502 8 \u2502 301.0 \u2502 335 \u2502 3.54 \u2502 3.570 \u2502 14.60 \u2502 0 \u2502 1 \u2502 5 \u2502 8 \u2502\n\u2502 Merc 450SLC \u2502 15.2 \u2502 8 \u2502 275.8 \u2502 180 \u2502 3.07 \u2502 3.780 \u2502 18.00 \u2502 0 \u2502 0 \u2502 3 \u2502 3 \u2502\n\u2502 AMC Javelin \u2502 15.2 \u2502 8 \u2502 304.0 \u2502 150 \u2502 3.15 \u2502 3.435 \u2502 17.30 \u2502 0 \u2502 0 \u2502 3 \u2502 2 \u2502\n\u2502 Dodge Challenger \u2502 15.5 \u2502 8 \u2502 318.0 \u2502 150 
\u2502 2.76 \u2502 3.520 \u2502 16.87 \u2502 0 \u2502 0 \u2502 3 \u2502 2 \u2502\n\u2502 Ford Pantera L \u2502 15.8 \u2502 8 \u2502 351.0 \u2502 264 \u2502 4.22 \u2502 3.170 \u2502 14.50 \u2502 0 \u2502 1 \u2502 5 \u2502 4 \u2502\n\u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
While TidierDB uses @arrange
like TidierData.jl
TidierDB
@chain t(mtcars) @arrange(mpg) @collect\n
32\u00d712 DataFrame\n Row \u2502 model mpg cyl disp hp drat wt qsec vs am gear carb\n \u2502 String? Float64? Int64? Float64? Int64? Float64? Float64? Float64? Int64? Int64? Int64? Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.25 17.98 0 0 3 4\n 2 \u2502 Lincoln Continental 10.4 8 460.0 215 3.0 5.424 17.82 0 0 3 4\n 3 \u2502 Camaro Z28 13.3 8 350.0 245 3.73 3.84 15.41 0 0 3 4\n 4 \u2502 Duster 360 14.3 8 360.0 245 3.21 3.57 15.84 0 0 3 4\n 5 \u2502 Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n 6 \u2502 Maserati Bora 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 8\n \u22ee \u2502 \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee\n 27 \u2502 Porsche 914-2 26.0 4 120.3 91 4.43 2.14 16.7 0 1 5 2\n 28 \u2502 Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.9 1 1 4 1\n 29 \u2502 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n 30 \u2502 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2\n 31 \u2502 Fiat 128 32.4 4 78.7 66 4.08 2.2 19.47 1 1 4 1\n 32 \u2502 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1\n 20 rows omitted\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#selecting-columns","title":"Selecting columns","text":"In Ibis, columns must be prefixed with the table name, or in this case _
, or they can be given as a string. Finally, using helper functions like startswith
requires importing selectors as above.
Ibis
mtcars.select(s.startswith(\"m\"), \"drat\", _.wt)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 model \u2503 mpg \u2503 drat \u2503 wt \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502 float64 \u2502 float64 \u2502 float64 \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Mazda RX4 \u2502 21.0 \u2502 3.90 \u2502 2.620 \u2502\n\u2502 Mazda RX4 Wag \u2502 21.0 \u2502 3.90 \u2502 2.875 \u2502\n\u2502 Datsun 710 \u2502 22.8 \u2502 3.85 \u2502 2.320 \u2502\n\u2502 Hornet 4 Drive \u2502 21.4 \u2502 3.08 \u2502 3.215 \u2502\n\u2502 Hornet Sportabout \u2502 18.7 \u2502 3.15 \u2502 3.440 \u2502\n\u2502 Valiant \u2502 18.1 \u2502 2.76 \u2502 3.460 \u2502\n\u2502 Duster 360 \u2502 14.3 \u2502 3.21 \u2502 3.570 \u2502\n\u2502 Merc 240D \u2502 24.4 \u2502 3.69 \u2502 3.190 \u2502\n\u2502 Merc 230 \u2502 22.8 \u2502 3.92 \u2502 3.150 \u2502\n\u2502 Merc 280 \u2502 19.2 \u2502 3.92 \u2502 3.440 \u2502\n\u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
TidierDB does not require names to be prefixed and, like TidierData, tidy column selection with starts_with
, ends_with
, and contains
is supported at base. TidierDB also supports providing column names as strings, although this would only be needed in the setting of renaming a column with a space in it.
TidierDB
@chain t(mtcars) @select(starts_with(\"m\"), \"drat\", wt) @collect\n
32\u00d74 DataFrame\n Row \u2502 model mpg drat wt\n \u2502 String? Float64? Float64? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 21.0 3.9 2.62\n 2 \u2502 Mazda RX4 Wag 21.0 3.9 2.875\n 3 \u2502 Datsun 710 22.8 3.85 2.32\n 4 \u2502 Hornet 4 Drive 21.4 3.08 3.215\n 5 \u2502 Hornet Sportabout 18.7 3.15 3.44\n 6 \u2502 Valiant 18.1 2.76 3.46\n \u22ee \u2502 \u22ee \u22ee \u22ee \u22ee\n 27 \u2502 Porsche 914-2 26.0 4.43 2.14\n 28 \u2502 Lotus Europa 30.4 3.77 1.513\n 29 \u2502 Ford Pantera L 15.8 4.22 3.17\n 30 \u2502 Ferrari Dino 19.7 3.62 2.77\n 31 \u2502 Maserati Bora 15.0 3.54 3.57\n 32 \u2502 Volvo 142E 21.4 4.11 2.78\n 20 rows omitted\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#multi-step-queries-and-summarizing","title":"Multi step queries and summarizing","text":"Aggregating data is done with aggregate
in Ibis and @summarize
in TidierDB. To group data, both utilize group_by
/@group_by
Ibis
mtcars.group_by(_.cyl).aggregate(\n total_hp=_.hp.sum(),\n avg_hp=_.hp.mean()\n).filter(_.total_hp < 1000)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 cyl \u2503 total_hp \u2503 avg_hp \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 int64 \u2502 int64 \u2502 float64 \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 6 \u2502 856 \u2502 122.285714 \u2502\n\u2502 4 \u2502 909 \u2502 82.636364 \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
In TidierDB, @filter
will automatically determine whether the criteria belong in a WHERE
or HAVING
SQL clause.
TidierDB
@chain t(mtcars) begin\n @group_by(cyl)\n @summarize(total_hp = sum(hp),\n avg_hp = avg(hp))\n @filter(total_hp < 1000)\n @collect\nend\n
2\u00d73 DataFrame\n Row \u2502 cyl total_hp avg_hp\n \u2502 Int64? Int128? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 6 856 122.286\n 2 \u2502 4 909 82.6364\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#renaming-columns","title":"Renaming columns","text":"Both tools use rename
/@rename to rename columns
Ibis
mtcars.rename(make_model = \"model\").select(_.make_model)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 make_model \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Mazda RX4 \u2502\n\u2502 Mazda RX4 Wag \u2502\n\u2502 Datsun 710 \u2502\n\u2502 Hornet 4 Drive \u2502\n\u2502 Hornet Sportabout \u2502\n\u2502 Valiant \u2502\n\u2502 Duster 360 \u2502\n\u2502 Merc 240D \u2502\n\u2502 Merc 230 \u2502\n\u2502 Merc 280 \u2502\n\u2502 \u2026 \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
TidierDB
@chain t(mtcars) @rename(model_make = model) @select(model_make) @collect\n
32\u00d71 DataFrame\n Row \u2502 model_make\n \u2502 String?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4\n 2 \u2502 Mazda RX4 Wag\n 3 \u2502 Datsun 710\n 4 \u2502 Hornet 4 Drive\n 5 \u2502 Hornet Sportabout\n 6 \u2502 Valiant\n \u22ee \u2502 \u22ee\n 27 \u2502 Porsche 914-2\n 28 \u2502 Lotus Europa\n 29 \u2502 Ford Pantera L\n 30 \u2502 Ferrari Dino\n 31 \u2502 Maserati Bora\n 32 \u2502 Volvo 142E\n 20 rows omitted\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/key_differences/","title":"Key Differences from TidierData.jl","text":"There are a few important syntax and behavior differences between TidierDB.jl and TidierData.jl outlined below.
"},{"location":"examples/generated/UserGuide/key_differences/#creating-a-database","title":"Creating a database","text":"For these examples we will use DuckDB, the default backend, although SQLite, Postgres, MySQL, MariaDB, MSSQL, and ClickHouse are possible. If you have an existing DuckDB connection, then this step is not required. For these examples, we will create a data frame and copy it to an in-memory DuckDB database.
using DataFrames, TidierDB\n\ndf = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9],\n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10],\n value = repeat(1:5, 2),\n percent = 0.1:0.1:1.0);\n\ndb = connect(duckdb());\n\ncopy_to(db, df, \"df_mem\"); # copying over the data frame to an in-memory database\n
"},{"location":"examples/generated/UserGuide/key_differences/#row-ordering","title":"Row ordering","text":"DuckDB benefits from aggressive parallelization of pipelines. This means that if you have multiple threads enabled in Julia, which you can check or set using Threads.nthreads()
, DuckDB will use multiple threads. However, because many operations are multi-threaded, the resulting row order is inconsistent. If row order needs to be deterministic for your use case, make sure to apply an @arrange(column_name_1, column_name_2, etc...)
prior to collecting the results.
When using TidierDB, db_table(connection, :table_name)
is used to start a chain.
In TidierDB, when performing @group_by
then @mutate
, the table will be ungrouped after applying all of the mutations in the clause to the grouped data. To perform subsequent grouped operations, the user would have to regroup the data. This is demonstrated below.
@chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarize(mean_percent = mean(percent))\n @collect\n end\n
2\u00d72 DataFrame Rowgroupsmean_percentStringFloat641bb0.52aa0.6 Regrouping following @mutate
@chain db_table(db, :df_mem) begin\n @group_by(groups)\n @mutate(max = maximum(percent), min = minimum(percent))\n @group_by(groups)\n @summarise(mean_percent = mean(percent))\n @collect\nend\n
2\u00d72 DataFrame Rowgroupsmean_percentStringFloat641aa0.62bb0.5 "},{"location":"examples/generated/UserGuide/key_differences/#differences-in-case_when","title":"Differences in case_when()
","text":"In TidierDB, after the clause is completed, the result for the new column should be separated by a comma ,
in contrast to TidierData.jl, where the result for the new column is separated by a =>
.
@chain db_table(db, :df_mem) begin\n @mutate(new_col = case_when(percent > .5, \"Pass\", # in TidierData, percent > .5 => \"Pass\",\n percent <= .5, \"Try Again\", # percent <= .5 => \"Try Again\"\n true, \"middle\"))\n @collect\n end\n
10\u00d75 DataFrame Rowidgroupsvaluepercentnew_colStringStringInt64Float64String1AAbb10.1Try Again2ABaa20.2Try Again3ACbb30.3Try Again4ADaa40.4Try Again5AEbb50.5Try Again6AFaa10.6Pass7AGbb20.7Pass8AHaa30.8Pass9AIbb40.9Pass10AJaa51.0Pass "},{"location":"examples/generated/UserGuide/key_differences/#joining-tables","title":"Joining Tables","text":"When joining a table, the column from both tables will be present, in contrast to TidierData which will keep one column
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/outofmemex/","title":"Working With Larger than RAM Datasets","text":"While using the DuckDB backend, TidierDB's lazy interface enables querying datasets larger than your available RAM.
To illustrate this, we will recreate the Hugging Face x Polars example. The final table results are shown below and in this Hugging Face x DuckDB example
First we will load TidierDB, set up a local database and then set the URLs for the 2 training datasets from huggingface.co
using TidierDB\ndb = connect(duckdb())\n\nurls = [\"https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0000.parquet\",\n \"https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0001.parquet\"];\n
Here, we pass the vector of URLs to db_table
, which will not copy them into memory. Since these datasets are so large, we will also set stream = true
in @collect
to stream the results. If we wanted to read all the files in the folder we could have replaced the 0000
with *
(wildcard) db_table(db, \"Path/to/folder/*.parquet\")
Of note, reading these files from URLs is not as rapid as reading them from local files.
@chain db_table(db, urls) begin\n @group_by(horoscope)\n @summarise(count = n(), avg_blog_length = mean(length(text)))\n @arrange(desc(avg_blog_length))\n @aside @show_query _\n @collect(stream = true)\nend\n
Placing @aside @show_query _
before @collect
above lets us see the SQL query and collect it to a local DataFrame at the same time.
SELECT horoscope, COUNT(*) AS count, AVG(length(text)) AS avg_blog_length\n FROM read_parquet(['https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0000.parquet', 'https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0001.parquet'])\n GROUP BY horoscope\n ORDER BY avg_blog_length DESC\n12\u00d73 DataFrame\n Row \u2502 horoscope count avg_blog_length\n \u2502 String? Int64? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Aquarius 49568 1125.83\n 2 \u2502 Cancer 63512 1097.96\n 3 \u2502 Libra 60304 1060.61\n 4 \u2502 Capricorn 49402 1059.56\n 5 \u2502 Sagittarius 50431 1057.46\n 6 \u2502 Leo 58010 1049.6\n 7 \u2502 Taurus 61571 1022.69\n 8 \u2502 Gemini 52925 1020.26\n 9 \u2502 Scorpio 56495 1014.03\n 10 \u2502 Pisces 53812 1011.75\n 11 \u2502 Virgo 64629 996.684\n 12 \u2502 Aries 69134 918.081\n
To learn more about memory efficient queries on larger than RAM files, this blog from DuckDB will help maximize your local db
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/s3viaduckdb/","title":"S3 + DuckDB + TidierDB","text":"TidierDB allows you to leverage DuckDB's seamless database integration.
Using DuckDB, you can connect to an AWS or GoogleCloud Database to query directly without making any local copies.
You can also use DBInterface.execute
to set up any DuckDB database connection you need and then use that db to query with TidierDB
using TidierDB\n\n#Connect to Google Cloud via DuckDB\n#google_db = connect(duckdb(), :gbq, access_key=\"string\", secret_key=\"string\")\n\n#Connect to AWS via DuckDB\naws_db = connect(duckdb(), :aws, aws_access_key_id= \"string\",\n aws_secret_access_key= \"string\",\n aws_region=\"us-east-1\")\ns3_csv_path = \"s3://path/to_data.csv\"\n\n@chain db_table(aws_db, s3_csv_path) begin\n @filter(!starts_with(column1, \"M\"))\n @group_by(cyl)\n @summarize(mpg = mean(mpg))\n @mutate(mpg_squared = mpg^2,\n mpg_rounded = round(mpg),\n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\"))\n @filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n @arrange(desc(mpg_rounded))\n @collect\nend\n
2\u00d75 DataFrame\n Row \u2502 cyl mpg mpg_squared mpg_rounded mpg_efficiency\n \u2502 Int64? Float64? Float64? Float64? String?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 27.3444 747.719 27.0 efficient\n 2 \u2502 6 19.7333 389.404 20.0 moderate\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/udfs_ex/","title":"Flexible Syntax and UDFs","text":"TidierDB is unique in its statement parsing flexibility. This means that any built-in SQL function or user-defined function (UDF) is readily available. To use any function built into a database in @mutate
or in @summarize
, simply write the function call correctly, but replace '
with \"
. This also applies to any UDF. The example below will illustrate UDFs in the context of DuckDB.
# Set up the connection\nusing TidierDB #reexports DuckDB\ndb = DuckDB.DB()\ncon = DuckDB.connect(db) # this will be important for UDFs\nmtcars_path = \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\"\nmtcars = db_table(con, mtcars_path);\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#aggregate-function-in-summarize","title":"aggregate function in @summarize
","text":"Lets use the DuckDB kurtosis
aggregate function
@chain t(mtcars) begin\n @group_by cyl\n @summarize(kurt = kurtosis(mpg))\n @collect\nend\n3\u00d72 DataFrame\n Row \u2502 cyl kurt\n \u2502 Int64? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 -1.43411\n 2 \u2502 6 -1.82944\n 3 \u2502 8 0.330061\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#aggregate-functions-in-mutate","title":"aggregate functions in @mutate
","text":"To aggregate sql functions that are builtin to any database, but exist outside of the TidierDB parser, simply wrap the function call in agg()
@chain t(mtcars) begin\n @group_by(cyl)\n @mutate(kurt = agg(kurtosis(mpg)))\n @select cyl mpg kurt\n @collect\nend\n\n32\u00d73 DataFrame\n Row \u2502 cyl mpg kurt\n \u2502 Int64? Float64? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 8 18.7 0.330061\n 2 \u2502 8 14.3 0.330061\n 3 \u2502 8 16.4 0.330061\n 4 \u2502 8 17.3 0.330061\n 5 \u2502 8 15.2 0.330061\n 6 \u2502 8 10.4 0.330061\n 7 \u2502 8 10.4 0.330061\n \u22ee \u2502 \u22ee \u22ee \u22ee\n 27 \u2502 6 21.0 -1.82944\n 28 \u2502 6 21.4 -1.82944\n 29 \u2502 6 18.1 -1.82944\n 30 \u2502 6 19.2 -1.82944\n 31 \u2502 6 17.8 -1.82944\n 32 \u2502 6 19.7 -1.82944\n 19 rows omitted\nend\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#duckdb-function-chaining","title":"DuckDB function chaining","text":"In DuckDB, functions can be chained together with .
. TidierDB lets you leverage this.
@chain t(mtcars) begin\n @mutate(model2 = model.upper().string_split(\" \").list_aggr(\"string_agg\",\".\").concat(\".\"))\n @select model model2\n @collect\nend\n32\u00d72 DataFrame\n Row \u2502 model model2\n \u2502 String? String?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 MAZDA.RX4.\n 2 \u2502 Mazda RX4 Wag MAZDA.RX4.WAG.\n 3 \u2502 Datsun 710 DATSUN.710.\n 4 \u2502 Hornet 4 Drive HORNET.4.DRIVE.\n 5 \u2502 Hornet Sportabout HORNET.SPORTABOUT.\n 6 \u2502 Valiant VALIANT.\n 7 \u2502 Duster 360 DUSTER.360.\n \u22ee \u2502 \u22ee \u22ee\n 27 \u2502 Porsche 914-2 PORSCHE.914-2.\n 28 \u2502 Lotus Europa LOTUS.EUROPA.\n 29 \u2502 Ford Pantera L FORD.PANTERA.L.\n 30 \u2502 Ferrari Dino FERRARI.DINO.\n 31 \u2502 Maserati Bora MASERATI.BORA.\n 32 \u2502 Volvo 142E VOLVO.142E.\n 19 rows omitted\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#rowid-and-pseudocolumns","title":"rowid
and pseudocolumns","text":"When a table is not being read directly from a file, rowid
is available for use. In general, TidierDB should support all pseudocolumns.
copy_to(db, mtcars_path, \"mtcars\"); # copying table in for demostration purposes\n@chain db_table(con, :mtcars) begin\n @filter(rowid == 4)\n @select(model:hp)\n @collect\nend\n1\u00d75 DataFrame\n Row \u2502 model mpg cyl disp hp\n \u2502 String? Float64? Int64? Float64? Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Hornet Sportabout 18.7 8 360.0 175\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#udf-sqlite-example","title":"UDF SQLite Example","text":"using SQLite\nsql = connect(sqlite());\ndf = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9],\n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10],\n value = repeat(1:5, 2),\n percent = 0.1:0.1:1.0);\n\ncopy_to(db, sql, \"df_mem\");\nSQLite.@register sql function diff_of_squares(x, y)\n x^2 - y^2\n end;\n\n@chain db_table(sql, \"df_mem\") begin\n @select(value, percent)\n @mutate(plus3 = diff_of_squares(value, percent))\n @collect\nend\n10\u00d73 DataFrame\n Row \u2502 value percent plus3\n \u2502 Int64 Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 0.1 0.99\n 2 \u2502 2 0.2 3.96\n 3 \u2502 3 0.3 8.91\n 4 \u2502 4 0.4 15.84\n 5 \u2502 5 0.5 24.75\n 6 \u2502 1 0.6 0.64\n 7 \u2502 2 0.7 3.51\n 8 \u2502 3 0.8 8.36\n 9 \u2502 4 0.9 15.19\n 10 \u2502 5 1.0 24.0\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#how-to-create-udf-in-duckdb","title":"How to create UDF in DuckDB","text":"Example coming soon..
This page was generated using Literate.jl.
"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":""},{"location":"#what-is-tidierdbjl","title":"What is TidierDB.jl?","text":"TidierDB.jl is a 100% Julia implementation of the dbplyr R package, and similar to Python's ibis package.
The main goal of TidierDB.jl is to bring the syntax of Tidier.jl to multiple SQL backends, making it possible to analyze data directly on databases without needing to copy the entire database into memory.
"},{"location":"#currently-supported-backends-include","title":"Currently supported backends include:","text":"DuckDB (default)duckdb()
ClickHouse clickhouse()
SQLite sqlite()
Postgres postgres()
MySQL mysql()
MariaDB mysql()
MSSQL mssql()
Athena athena()
Snowflake snowflake()
Databricks databricks()
Google Big Query gbq()
Oracle oracle()
Change the backend using set_sql_mode()
- for example - set_sql_mode(databricks())
For the stable version:
] add TidierDB\n
TidierDB.jl currently supports:
Category Supported Macros and Functions Data Manipulation@arrange
, @group_by
, @filter
, @select
, @mutate
(supports across
), @summarize
/@summarise
(supports across
), @distinct
Joining @left_join
, @right_join
, @inner_join
, @anti_join
, @full_join
, @semi_join
, @union
, @union_all
Slice and Order @slice_min
, @slice_max
, @slice_sample
, @order
, @window_order
, @window_frame
Utility @show_query
, @collect
, @head
, @count
, show_tables
, @create_view
, drop_view
Helper Functions across
, desc
, if_else
, case_when
, n
, starts_with
, ends_with
, contains
, as_float
, as_integer
, as_string
, is_missing
, missing_if
, replace_missing
TidierStrings.jl Functions str_detect
, str_replace
, str_replace_all
, str_remove_all
, str_remove
TidierDates.jl Functions year
, month
, day
, hour
, min
, second
, floor_date
, difftime
, mdy
, ymd
, dmy
Aggregate Functions mean
, minimum
, maximum
, std
, sum
, cumsum
, cor
, cov
, var
, all aggregate sql fxns @summarize
supports any SQL aggregate function in addition to the list above. Simply write the function as written in SQL syntax and it will work. @mutate
supports all builtin SQL functions as well.
When using the DuckDB backend, if db_table
receives a file path ( .parquet
, .json
, .csv
, iceberg
or delta
), it does not copy it into memory. This allows for queries on files too big for memory. db_table
also supports S3 bucket locations via DuckDB.
Typically, you will want to use TidierDB alongside TidierData because certain functionality (such as pivoting) is only supported in TidierData and can only be performed on data frames.
Our recommended path for using TidierDB is to import the package so that there are no namespace conflicts with TidierData. Once TidierDB is integrated with Tidier, then Tidier will automatically load the packages in this fashion.
First, let's develop and execute a query using TidierDB. Notice that all top-level macros and functions originating from TidierDB start with a DB
prefix. Any functions defined within macros do not need to be prefixed with DB
because they are pseudofunctions that are converted into SQL code.
Even though the code reads similarly to TidierData, note that no computational work actually occurs until you run DB.@collect()
, which runs the SQL query and instantiates the result as a DataFrame.
using TidierData\nimport TidierDB as DB\n\ndb = DB.connect(DB.duckdb());\npath_or_name = \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\"\n\n@chain DB.db_table(db, path_or_name) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@collect\nend\n
2\u00d75 DataFrame\n Row \u2502 cyl mpg mpg_squared mpg_rounded mpg_efficiency \n \u2502 Int64? Float64? Float64? Float64? String? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 27.3444 747.719 27.0 efficient\n 2 \u2502 6 19.7333 389.404 20.0 moderate\n
"},{"location":"#what-if-we-wanted-to-pivot-the-result","title":"What if we wanted to pivot the result?","text":"We cannot do this using TidierDB. However, we can call @pivot_longer()
from TidierData after the result of the query has been instantiated as a DataFrame, like this:
@chain DB.db_table(db, path_or_name) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@collect\n @pivot_longer(everything(), names_to = \"variable\", values_to = \"value\")\nend\n
10\u00d72 DataFrame\n Row \u2502 variable value \n \u2502 String Any \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 cyl 4\n 2 \u2502 cyl 6\n 3 \u2502 mpg 27.3444\n 4 \u2502 mpg 19.7333\n 5 \u2502 mpg_squared 747.719\n 6 \u2502 mpg_squared 389.404\n 7 \u2502 mpg_rounded 27.0\n 8 \u2502 mpg_rounded 20.0\n 9 \u2502 mpg_efficiency efficient\n 10 \u2502 mpg_efficiency moderate\n
"},{"location":"#what-sql-query-does-tidierdb-generate-for-a-given-piece-of-julia-code","title":"What SQL query does TidierDB generate for a given piece of Julia code?","text":"We can replace DB.collect()
with DB.@show_query
to reveal the underlying SQL query being generated by TidierDB. To handle complex queries, TidierDB makes heavy use of Common Table Expressions (CTE), which are a useful tool to organize long queries.
@chain DB.db_table(db, path_or_name) begin\n DB.@filter(!starts_with(model, \"M\"))\n DB.@group_by(cyl)\n DB.@summarize(mpg = mean(mpg))\n DB.@mutate(mpg_squared = mpg^2, \n mpg_rounded = round(mpg), \n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\")) \n DB.@filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n DB.@arrange(desc(mpg_rounded))\n DB.@show_query\nend\n
WITH cte_1 AS (\nSELECT *\n FROM mtcars\n WHERE NOT (starts_with(model, 'M'))),\ncte_2 AS (\nSELECT cyl, AVG(mpg) AS mpg\n FROM cte_1\n GROUP BY cyl),\ncte_3 AS (\nSELECT cyl, mpg, POWER(mpg, 2) AS mpg_squared, ROUND(mpg) AS mpg_rounded, CASE WHEN mpg >= POWER(cyl, 2) THEN 'efficient' WHEN mpg < 15.2 THEN 'inefficient' ELSE 'moderate' END AS mpg_efficiency\n FROM cte_2 ),\ncte_4 AS (\nSELECT *\n FROM cte_3\n WHERE mpg_efficiency in ('moderate', 'efficient')) \nSELECT *\n FROM cte_4 \n ORDER BY mpg_rounded DESC\n
"},{"location":"#tidierdb-is-already-quite-fully-featured-supporting-advanced-tidierdata-functions-like-across-for-multi-column-selection","title":"TidierDB is already quite fully-featured, supporting advanced TidierData functions like across()
for multi-column selection.","text":"@chain DB.db_table(db, path_or_name) begin\n DB.@group_by(cyl)\n DB.@summarize(across((starts_with(\"a\"), ends_with(\"s\")), (mean, sum)))\n DB.@collect\nend\n
3\u00d75 DataFrame\n Row \u2502 cyl am_mean vs_mean am_sum vs_sum \n \u2502 Int64? Float64? Float64? Int128? Int128? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 0.727273 0.909091 8 10\n 2 \u2502 6 0.428571 0.571429 3 4\n 3 \u2502 8 0.142857 0.0 2 0\n
Bang bang !!
interpolation for columns and values is also supported.
There are a few subtle but important differences from Tidier.jl outlined here.
"},{"location":"#missing-a-function-or-backend","title":"Missing a function or backend?","text":"You can use any existing SQL function within @mutate
with the correct SQL syntax and it should just work.
But if you run into problems please open an issue, and we will be happy to take a look!
"},{"location":"reference/","title":"Reference","text":""},{"location":"reference/#index","title":"Index","text":"TidierDB.connect
TidierDB.copy_to
TidierDB.db_table
TidierDB.show_tables
TidierDB.warnings
TidierDB.@anti_join
TidierDB.@arrange
TidierDB.@collect
TidierDB.@count
TidierDB.@create_view
TidierDB.@distinct
TidierDB.@filter
TidierDB.@full_join
TidierDB.@group_by
TidierDB.@head
TidierDB.@inner_join
TidierDB.@left_join
TidierDB.@mutate
TidierDB.@rename
TidierDB.@right_join
TidierDB.@select
TidierDB.@semi_join
TidierDB.@slice_max
TidierDB.@slice_min
TidierDB.@slice_sample
TidierDB.@summarise
TidierDB.@summarize
TidierDB.@union
TidierDB.@union_all
TidierDB.@window_frame
TidierDB.@window_order
# TidierDB.connect
\u2014 Method.
connect(backend; kwargs...)\n
This function establishes a database connection based on the specified backend and connection parameters and sets the SQL mode
Arguments
backend
: type specifying the database backend to connect to. Supported backends are:
duckdb()
, sqlite()
(SQLite), mssql()
, mysql()
(for MariaDB and MySQL), clickhouse()
, postgres()
kwargs
: Keyword arguments specifying the connection parameters for the selected backend. The required parameters vary depending on the backend:
MySQL:
host
: The host name or IP address of the MySQL server. Default is \"localhost\".user
: The username for authentication. Default is an empty string.password
: The password for authentication.db
: The name of the database to connect to (optional).port
: The port number of the MySQL server (optional).Returns
Examples
# Connect to MySQL\n# conn = connect(mysql(); host=\"localhost\", user=\"root\", password=\"password\", db=\"mydb\")\n# Connect to PostgreSQL using LibPQ\n# conn = connect(postgres(); host=\"localhost\", dbname=\"mydb\", user=\"postgres\", password=\"password\")\n# Connect to ClickHouse\n# conn = connect(clickhouse(); host=\"localhost\", port=9000, database=\"mydb\", user=\"default\", password=\"\")\n# Connect to SQLite\n# conn = connect(sqlite())\n# Connect to Google Big Query\n# conn = connect(gbq(), \"json_user_key_path\", \"location\")\n# Connect to Snowflake\n# conn = connect(snowflake(), \"ac_id\", \"token\", \"Database_name\", \"Schema_name\", \"warehouse_name\")\n# Connect to Microsoft SQL Server\n# conn = connect(mssql(), \"DRIVER={ODBC Driver 18 for SQL Server};SERVER=host,1433;UID=sa;PWD=YourPassword;Encrypt=no;TrustServerCertificate=yes\")\n# Connect to DuckDB\n# connect to Google Cloud via DuckDB\n# google_db = connect(duckdb(), :gbq, access_key=\"string\", secret_key=\"string\")\n# Connect to AWS via DuckDB\n# aws_db = connect2(duckdb(), :aws, aws_access_key_id=get(ENV, \"AWS_ACCESS_KEY_ID\", \"access_key\"), aws_secret_access_key=get(ENV, \"AWS_SECRET_ACCESS_KEY\", \"secret_access key\"), aws_region=get(ENV, \"AWS_DEFAULT_REGION\", \"us-east-1\"))\n# Connect to MotherDuck\n# connect(duckdb(), \"\"md://...\"\") for first connection, vs connect(duckdb(), \"md:\") for reconnection\n# Connect to exisiting database file\n# connect(duckdb(), \"path/to/database.duckdb\")\n# Open an in-memory database\njulia> db = connect(duckdb())\nDuckDB.Connection(\":memory:\")\n
source
# TidierDB.copy_to
\u2014 Method.
copy_to(conn, df_or_path, \"name\")\n
Allows user to copy a df to the database connection. Currently supports DuckDB, SQLite, MySql
Arguments
-conn
: the database connection -df
: dataframe to be copied or path to serve as source. With DuckDB, path supports .csv, .json, .parquet to be used without copying intermediary df. -name
: name as string for the database to be used
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"test\");\n
source
# TidierDB.db_table
\u2014 Function.
db_table(database, table_name, athena_params, delta = false, iceberg = false)\n
db_table
starts the underlying SQL query struct, adding the metadata and table. If paths are passed directly to db_table instead of a name, it will not copy the data into memory, but rather read directly from the file. db_table
only supports direct file paths to a table. It does not support database file paths such as dbname.duckdb
or dbname.sqlite
. Such files must be used with connect first.
Arguments
database
: The Database or connection objecttable_name
: tablename as a string (name, local path, or URL). - CSV/TSV - Parquet - Json - Iceberg - Delta - S3 tables from AWS or Google Cloud
*
wildcards to read all files of a type in a location such as:db_table(db, \"Path/to/testing_files/*.parquet\")
delta
: must be true to read delta filesiceberg
: must be true to read iceberg files. Example
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> db_table(db, \"df_mem\")\nTidierDB.SQLQuery(\"\", \"df_mem\", \"\", \"\", \"\", \"\", \"\", \"\", false, false, 4\u00d74 DataFrame\n Row \u2502 name type current_selxn table_name \n \u2502 String? String? Int64 String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 id VARCHAR 1 df_mem\n 2 \u2502 groups VARCHAR 1 df_mem\n 3 \u2502 value BIGINT 1 df_mem\n 4 \u2502 percent DOUBLE 1 df_mem, false, DuckDB.Connection(\":memory:\"), TidierDB.CTE[], 0, nothing)\n
source
# TidierDB.show_tables
\u2014 Method.
show_tables(con; GBQ_datasetname)\n
Shows tables available in the database. Currently supports DuckDB, Databricks, Snowflake, GBQ, SQLite, LibPQ
Arguments
con
: connection to backendGBQ_datasetname
: string of dataset nameExamples
julia> db = connect(duckdb());\n\njulia> show_tables(db);\n
source
# TidierDB.warnings
\u2014 Method.
warnings(show::Bool)\n
Sets the global warning flag to the specified boolean value.
Arguments
flag::Bool
: A boolean value to set the warning flag. If true
, warnings will be enabled; if false
, warnings will be disabled.Default Behavior
By default, the warning flag is set to false
, meaning that warnings are disabled unless explicitly enabled by calling this function with true
.
Example
julia> warnings(true);\n
source
# TidierDB.@anti_join
\u2014 Macro.
@anti_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform an anti join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable for metadata storage that the two column names be different.
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or stringsnew_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or stringsExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @anti_join(\"df_join\", id = id2)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AB aa 2 0.2\n 2 \u2502 AD aa 4 0.4\n 3 \u2502 AF aa 1 0.6\n 4 \u2502 AH aa 3 0.8\n 5 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@arrange
\u2014 Macro.
@arrange(sql_query, columns...)\n
Order SQL table rows based on specified column(s).
Arguments
sql_query
: The SQL query to operate on.columns
: Columns to order the rows by. Can include multiple columns for nested sorting. Wrap column name with desc()
for descending order.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @arrange(value, desc(percent))\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AF aa 1 0.6\n 2 \u2502 AA bb 1 0.1\n 3 \u2502 AG bb 2 0.7\n 4 \u2502 AB aa 2 0.2\n 5 \u2502 AH aa 3 0.8\n 6 \u2502 AC bb 3 0.3\n 7 \u2502 AI bb 4 0.9\n 8 \u2502 AD aa 4 0.4\n 9 \u2502 AJ aa 5 1.0\n 10 \u2502 AE bb 5 0.5\n
source
# TidierDB.@collect
\u2014 Macro.
@collect(sql_query, stream = false)\n
@collect
runs the SQL query and instantiates the result as a DataFrame.
Arguments
sql_query
: The SQL query to operate on.stream
: optional streaming for query/execution of results when using duck db. Defaults to falseExample
julia> db = connect(duckdb());\n\njulia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @collect db_table(db, \"df_mem\")\n10\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AB aa 2 0.2\n 3 \u2502 AC bb 3 0.3\n 4 \u2502 AD aa 4 0.4\n 5 \u2502 AE bb 5 0.5\n 6 \u2502 AF aa 1 0.6\n 7 \u2502 AG bb 2 0.7\n 8 \u2502 AH aa 3 0.8\n 9 \u2502 AI bb 4 0.9\n 10 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@count
\u2014 Macro.
@count(sql_query, columns...)\n
Count the number of rows grouped by specified column(s).
Arguments
sql_query
: The SQL query to operate on.columns
: Columns to group by before counting. If no columns are specified, counts all rows in the query.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @count(groups)\n @arrange(groups)\n @collect\n end\n2\u00d72 DataFrame\n Row \u2502 groups count \n \u2502 String Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 5\n 2 \u2502 bb 5\n
source
# TidierDB.@create_view
\u2014 Macro.
@create_view(sql_query, name, replace = true)\n
Create a view from a SQL query. Currently supports DuckDB, MySQL, GBQ, Postgres
Arguments
sql_query
: The SQL query to create a view from.name
: The name of the view to create.replace
: defaults to true if view should be replacedExamples
julia> db = connect(duckdb());\n\njulia> df = DataFrame(id = [1, 2, 3], value = [10, 20, 30]);\n\njulia> copy_to(db, df, \"df1\");\n\njulia> @chain db_table(db, \"df1\") @create_view(viewer);\n\njulia> db_table(db, \"viewer\")\nTidierDB.SQLQuery(\"\", \"viewer\", \"\", \"\", \"\", \"\", \"\", \"\", false, false, 2\u00d74 DataFrame\n Row \u2502 name type current_selxn table_name \n \u2502 String String Int64 String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 id BIGINT 1 viewer\n 2 \u2502 value BIGINT 1 viewer, false, DuckDB.DB(\":memory:\"), TidierDB.CTE[], 0, nothing, \"\", \"\", 0)\n
source
# TidierDB.@distinct
\u2014 Macro.
@distinct(sql_query, columns...)\n
Select distinct rows based on specified column(s). Distinct works differently in TidierData vs SQL and therefore TidierDB. Distinct will also select only the columns it is given (or all if given none).
Arguments
sql_query
: The SQL query to operate on. columns
: Columns to determine uniqueness. If no columns are specified, all columns are used to identify distinct rows.
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @distinct(value)\n @arrange(value)\n @collect\n end\n5\u00d71 DataFrame\n Row \u2502 value \n \u2502 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1\n 2 \u2502 2\n 3 \u2502 3\n 4 \u2502 4\n 5 \u2502 5\n\njulia> @chain db_table(db, :df_mem) begin\n @distinct\n @arrange(id)\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AB aa 2 0.2\n 3 \u2502 AC bb 3 0.3\n 4 \u2502 AD aa 4 0.4\n 5 \u2502 AE bb 5 0.5\n 6 \u2502 AF aa 1 0.6\n 7 \u2502 AG bb 2 0.7\n 8 \u2502 AH aa 3 0.8\n 9 \u2502 AI bb 4 0.9\n 10 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@filter
\u2014 Macro.
@filter(sql_query, conditions...)\n
Filter rows in a SQL table based on specified conditions.
Arguments
sql_query
: The SQL query to filter rows from.conditions
: Expressions specifying the conditions that rows must satisfy to be included in the output. Rows for which the expression evaluates to true
will be included in the result. Multiple conditions can be combined using logical operators (&&
, ||
). It will automatically detect whether the conditions belong in WHERE vs HAVING.
Temporarily, it is best to use begin and end when filtering multiple conditions. (ex 2 below)\n
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @filter(percent > .5)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AF aa 1 0.6\n 2 \u2502 AG bb 2 0.7\n 3 \u2502 AH aa 3 0.8\n 4 \u2502 AI bb 4 0.9\n 5 \u2502 AJ aa 5 1.0\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(mean = mean(percent))\n @filter begin \n groups == \"bb\" || # logical operators can still be used like this\n mean > .5\n end\n @arrange(groups)\n @collect\n end\n2\u00d72 DataFrame\n Row \u2502 groups mean \n \u2502 String Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 0.6\n 2 \u2502 bb 0.5\n
source
# TidierDB.@full_join
\u2014 Macro.
@full_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform a full join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable, for the metadata storage, for the names to be different
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or stringsnew_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or stringsExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @full_join((@chain db_table(db, \"df_join\") @filter(score > 70)), id)\n #@aside @show_query _\n @collect\n end\n11\u00d77 DataFrame\n Row \u2502 id groups value percent id_1 category score \n \u2502 String? String? Int64? Float64? String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 AB aa 2 0.2 missing missing missing \n 7 \u2502 AD aa 4 0.4 missing missing missing \n 8 \u2502 AF aa 1 0.6 missing missing missing \n 9 \u2502 AH aa 3 0.8 missing missing missing \n 10 \u2502 AJ aa 5 1.0 missing missing missing \n 11 \u2502 missing missing missing missing AM X 74\n
source
# TidierDB.@group_by
\u2014 Macro.
@group_by(sql_query, columns...)\n
Group SQL table rows by specified column(s). If grouping is performed as a terminal operation without a subsequent mutation or summarization (as in the example below), then the resulting data frame will be ungrouped when @collect
is applied.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions specifying the columns to group by. Columns can be specified by name.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @arrange(groups)\n @collect\n end\n2\u00d71 DataFrame\n Row \u2502 groups \n \u2502 String \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa\n 2 \u2502 bb\n
source
# TidierDB.@head
\u2014 Macro.
@head(sql_query, value)\n
Limit the number of rows returned from a SQL table to the specified value. Equivalent to LIMIT
in SQL
Arguments
sql_query
: The SQL query to operate on.value
: Number to limit how many rows are returned. If left empty, it will default to 6 rowsExamples
julia> db = connect(duckdb());\n\njulia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> copy_to(db, df, \"df_mem\"); \n\njulia> @chain db_table(db, :df_mem) begin\n @head(1) ## supports expressions ie `3-2` would return the same df below\n @collect\n end\n1\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n
source
# TidierDB.@inner_join
\u2014 Macro.
@inner_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform an inner join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable, for the metadata storage, for the names to be different
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or stringsnew_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or stringsExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @inner_join(\"df_join\", \"id\" = id2)\n @collect\n end\n5\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String String Int64 Float64 String String Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n
source
# TidierDB.@left_join
\u2014 Macro.
@left_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform a left join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable, for the metadata storage, for the names to be different
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or stringsnew_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or stringsExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, \"df_mem\") begin\n @left_join(\"df_join\", \"id\" = \"id2\" )\n @collect\n end\n10\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String String Int64 Float64 String? String? Int64? \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 AB aa 2 0.2 missing missing missing \n 7 \u2502 AD aa 4 0.4 missing missing missing \n 8 \u2502 AF aa 1 0.6 missing missing missing \n 9 \u2502 AH aa 3 0.8 missing missing missing \n 10 \u2502 AJ aa 5 1.0 missing missing missing \n\njulia> query = @chain db_table(db, \"df_join\") begin\n @filter(score > 85) # only show scores above 85 in joining table\n end;\n\njulia> @chain db_table(db, \"df_mem\") begin\n @left_join(t(query), id = id2)\n @collect\n end\n10\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String String Int64 Float64 String? String? Int64? 
\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AI bb 4 0.9 AI X 95\n 4 \u2502 AB aa 2 0.2 missing missing missing \n 5 \u2502 AD aa 4 0.4 missing missing missing \n 6 \u2502 AE bb 5 0.5 missing missing missing \n 7 \u2502 AF aa 1 0.6 missing missing missing \n 8 \u2502 AG bb 2 0.7 missing missing missing \n 9 \u2502 AH aa 3 0.8 missing missing missing \n 10 \u2502 AJ aa 5 1.0 missing missing missing \n
source
# TidierDB.@mutate
\u2014 Macro.
@mutate(sql_query, exprs...; _by)\n
Mutate SQL table rows by adding new columns or modifying existing ones.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions for mutating the table. New columns can be added or existing columns modified using column_name = expression syntax, where expression can involve existing columns._by
: optional argument that supports single column names, or vectors of columns to allow for grouping for the transformation in the macro callExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @mutate(value = value * 4, new_col = percent^2)\n @collect\n end\n10\u00d75 DataFrame\n Row \u2502 id groups value percent new_col \n \u2502 String String Int64 Float64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 4 0.1 0.01\n 2 \u2502 AB aa 8 0.2 0.04\n 3 \u2502 AC bb 12 0.3 0.09\n 4 \u2502 AD aa 16 0.4 0.16\n 5 \u2502 AE bb 20 0.5 0.25\n 6 \u2502 AF aa 4 0.6 0.36\n 7 \u2502 AG bb 8 0.7 0.49\n 8 \u2502 AH aa 12 0.8 0.64\n 9 \u2502 AI bb 16 0.9 0.81\n 10 \u2502 AJ aa 20 1.0 1.0\n\njulia> @chain db_table(db, :df_mem) begin\n @mutate(max = maximum(percent), sum = sum(percent), _by = groups)\n @collect\n end\n10\u00d76 DataFrame\n Row \u2502 id groups value percent max sum \n \u2502 String String Int64 Float64 Float64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AB aa 2 0.2 1.0 3.0\n 2 \u2502 AD aa 4 0.4 1.0 3.0\n 3 \u2502 AF aa 1 0.6 1.0 3.0\n 4 \u2502 AH aa 3 0.8 1.0 3.0\n 5 \u2502 AJ aa 5 1.0 1.0 3.0\n 6 \u2502 AA bb 1 0.1 0.9 2.5\n 7 \u2502 AC bb 3 0.3 0.9 2.5\n 8 \u2502 AE bb 5 0.5 0.9 2.5\n 9 \u2502 AG bb 2 0.7 0.9 2.5\n 10 \u2502 AI bb 4 0.9 0.9 2.5\n
source
# TidierDB.@rename
\u2014 Macro.
@rename(sql_query, renamings...)\n
Rename one or more columns in a SQL query.
Arguments
-sql_query
: The SQL query to operate on. -renamings
: One or more pairs of old and new column names, specified as new name = old name
Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @rename(new_name = percent)\n @collect\n end\n10\u00d74 DataFrame\n Row \u2502 id groups value new_name \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AB aa 2 0.2\n 3 \u2502 AC bb 3 0.3\n 4 \u2502 AD aa 4 0.4\n 5 \u2502 AE bb 5 0.5\n 6 \u2502 AF aa 1 0.6\n 7 \u2502 AG bb 2 0.7\n 8 \u2502 AH aa 3 0.8\n 9 \u2502 AI bb 4 0.9\n 10 \u2502 AJ aa 5 1.0\n
source
# TidierDB.@right_join
\u2014 Macro.
@right_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform a right join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable, for the metadata storage, for the names to be different
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or stringsnew_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or stringsExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @right_join(\"df_join\", id = id2)\n @collect\n end\n7\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? String String Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 missing missing missing missing AK Y 68\n 7 \u2502 missing missing missing missing AM X 74\n\njulia> query = @chain db_table(db, \"df_join\") begin\n @filter(score >= 74) # only show scores above 85 in joining table\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @right_join(t(query), id = id2)\n @collect\n end\n6\u00d77 DataFrame\n Row \u2502 id groups value percent id2 category score \n \u2502 String? String? Int64? Float64? 
String String Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 AA X 88\n 2 \u2502 AC bb 3 0.3 AC Y 92\n 3 \u2502 AE bb 5 0.5 AE X 77\n 4 \u2502 AG bb 2 0.7 AG Y 83\n 5 \u2502 AI bb 4 0.9 AI X 95\n 6 \u2502 missing missing missing missing AM X 74\n
source
# TidierDB.@select
\u2014 Macro.
@select(sql_query, columns)\n
Select specified columns from a SQL table.
Arguments
sql_query
: The SQL query to select columns from.columns
: Expressions specifying the columns to select. Columns can be specified by name, and new columns can be created with expressions using existing column values.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> df_mem = db_table(db, :df_mem);\n\njulia> @chain t(df_mem) begin\n @select(groups:percent)\n @collect\n end\n10\u00d73 DataFrame\n Row \u2502 groups value percent \n \u2502 String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 bb 1 0.1\n 2 \u2502 aa 2 0.2\n 3 \u2502 bb 3 0.3\n 4 \u2502 aa 4 0.4\n 5 \u2502 bb 5 0.5\n 6 \u2502 aa 1 0.6\n 7 \u2502 bb 2 0.7\n 8 \u2502 aa 3 0.8\n 9 \u2502 bb 4 0.9\n 10 \u2502 aa 5 1.0\n\njulia> @chain t(df_mem) begin\n @select(contains(\"e\"))\n @collect\n end\n10\u00d72 DataFrame\n Row \u2502 value percent \n \u2502 Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 0.1\n 2 \u2502 2 0.2\n 3 \u2502 3 0.3\n 4 \u2502 4 0.4\n 5 \u2502 5 0.5\n 6 \u2502 1 0.6\n 7 \u2502 2 0.7\n 8 \u2502 3 0.8\n 9 \u2502 4 0.9\n 10 \u2502 5 1.0\n
source
# TidierDB.@semi_join
\u2014 Macro.
@semi_join(sql_query, join_table, orignal_table_col = new_table_col)\n
Perform a semi join between two SQL queries based on a specified condition. The syntax here is slightly different from TidierData.jl: because SQL does not drop the joining column, it is preferable, for the metadata storage, for the names to be different
Arguments
sql_query
: The primary SQL query to operate on.join_table
: The secondary SQL table to join with the primary query table.orignal_table_col
: Column from the original table that matches for join. Accepts cols as bare column names or stringsnew_table_col
: Column from the new table that matches for join. Accepts cols as bare column names or stringsExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> df2 = DataFrame(id2 = [\"AA\", \"AC\", \"AE\", \"AG\", \"AI\", \"AK\", \"AM\"],\n category = [\"X\", \"Y\", \"X\", \"Y\", \"X\", \"Y\", \"X\"],\n score = [88, 92, 77, 83, 95, 68, 74]);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> copy_to(db, df2, \"df_join\");\n\njulia> @chain db_table(db, :df_mem) begin\n @semi_join(\"df_join\", id = id2)\n @collect\n end\n5\u00d74 DataFrame\n Row \u2502 id groups value percent \n \u2502 String String Int64 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1\n 2 \u2502 AC bb 3 0.3\n 3 \u2502 AE bb 5 0.5\n 4 \u2502 AG bb 2 0.7\n 5 \u2502 AI bb 4 0.9\n
source
# TidierDB.@slice_max
\u2014 Macro.
@slice_max(sql_query, column, n = 1)\n
Select rows with the largest values in specified column. This will always return ties.
Arguments
sql_query
: The SQL query to operate on.column
: Column to identify the largest values.n
: The number of rows to select with the largest values for each specified column. Default is 1, which selects the row with the largest value.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_max(value, n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_max(value)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 id groups value percent rank_col \n \u2502 String String Int64 Float64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AE bb 5 0.5 1\n 2 \u2502 AJ aa 5 1.0 1\n
source
# TidierDB.@slice_min
\u2014 Macro.
@slice_min(sql_query, column, n = 1)\n
Select rows with the smallest values in specified column. This will always return ties.
Arguments
sql_query
: The SQL query to operate on.column
: Column to identify the smallest values.n
: The number of rows to select with the smallest values for each specified column. Default is 1, which selects the row with the smallest value.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_min(value, n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_min(value)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 id groups value percent rank_col \n \u2502 String String Int64 Float64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 AA bb 1 0.1 1\n 2 \u2502 AF aa 1 0.6 1\n
source
# TidierDB.@slice_sample
\u2014 Macro.
@slice_sample(sql_query, n)\n
Randomly select a specified number of rows from a SQL table.
Arguments
sql_query
: The SQL query to operate on.n
: The number of rows to randomly select.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @slice_sample(n = 2)\n @collect\n end;\n\njulia> @chain db_table(db, :df_mem) begin\n @slice_sample()\n @collect\n end;\n
source
# TidierDB.@summarise
\u2014 Macro.
@summarise(sql_query, exprs...)\n
Aggregate and summarize specified columns of a SQL table.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions defining the aggregation and summarization operations. These can specify simple aggregations like mean, sum, and count, or more complex expressions involving existing column values.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(across((value:percent), (mean, sum)))\n @arrange(groups)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 groups value_mean percent_mean value_sum percent_sum \n \u2502 String Float64 Float64 Int128 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 0.6 15 3.0\n 2 \u2502 bb 3.0 0.5 15 2.5\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(test = sum(percent), n = n())\n @arrange(groups)\n @collect\n end\n2\u00d73 DataFrame\n Row \u2502 groups test n \n \u2502 String Float64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 5\n 2 \u2502 bb 2.5 5\n
source
# TidierDB.@summarize
\u2014 Macro.
@summarize(sql_query, exprs...; _by)\n
Aggregate and summarize specified columns of a SQL table.
Arguments
sql_query
: The SQL query to operate on.exprs
: Expressions defining the aggregation and summarization operations. These can specify simple aggregations like mean, sum, and count, or more complex expressions involving existing column values._by
: optional argument that supports single column names, or vectors of columns to allow for grouping for the aggregation in the macro callExamples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(across((ends_with(\"e\"), starts_with(\"p\")), (mean, sum)))\n @arrange(groups)\n @collect\n end\n2\u00d75 DataFrame\n Row \u2502 groups value_mean percent_mean value_sum percent_sum \n \u2502 String Float64 Float64 Int128 Float64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 0.6 15 3.0\n 2 \u2502 bb 3.0 0.5 15 2.5\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarise(test = sum(percent), n = n())\n @arrange(groups)\n @collect\n end\n2\u00d73 DataFrame\n Row \u2502 groups test n \n \u2502 String Float64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 5\n 2 \u2502 bb 2.5 5\n\njulia> @chain db_table(db, :df_mem) begin\n @summarise(test = sum(percent), n = n(), _by = groups)\n @arrange(groups)\n @collect\n end\n2\u00d73 DataFrame\n Row \u2502 groups test n \n \u2502 String Float64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 aa 3.0 5\n 2 \u2502 bb 2.5 5\n
source
# TidierDB.@union
\u2014 Macro.
@union(sql_query1, sql_query2)\n
Combine two SQL queries using the UNION
operator.
Arguments
sql_query1
: The first SQL query to combine.sql_query2
: The second SQL query to combine.Returns
Examples
julia> db = connect(duckdb());\n\njulia> df1 = DataFrame(id = [1, 2, 3], value = [10, 20, 30]);\n\njulia> df2 = DataFrame(id = [4, 5, 6], value = [40, 50, 60]);\n\njulia> copy_to(db, df1, \"df1\");\n\njulia> copy_to(db, df2, \"df2\");\n\njulia> df1_table = db_table(db, \"df1\");\n\njulia> df2_table = db_table(db, \"df2\");\n\njulia> @chain t(df1_table) @union(df2_table) @collect\n6\u00d72 DataFrame\n Row \u2502 id value \n \u2502 Int64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 10\n 2 \u2502 2 20\n 3 \u2502 3 30\n 4 \u2502 4 40\n 5 \u2502 5 50\n 6 \u2502 6 60\n\njulia> query = @chain t(df2_table) @filter(value == 50);\n\njulia> @chain t(df1_table) begin \n @union(t(query))\n @collect\n end\n4\u00d72 DataFrame\n Row \u2502 id value \n \u2502 Int64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 10\n 2 \u2502 2 20\n 3 \u2502 3 30\n 4 \u2502 5 50\n\njulia> @chain t(df1_table) begin \n @union(t(df1_table))\n @collect\n end\n3\u00d72 DataFrame\n Row \u2502 id value \n \u2502 Int64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 10\n 2 \u2502 2 20\n 3 \u2502 3 30\n
source
# TidierDB.@window_frame
\u2014 Macro.
@window_frame(sql_query, args...)\n
Define the window frame for window functions in a SQL query, specifying the range of rows to include in the calculation relative to the current row.
Arguments
sqlquery::SQLQuery
: The SQLQuery instance to which the window frame will be applied.args...
: A variable number of arguments specifying the frame boundaries. These can be:
from
: The starting point of the frame. Can be a positive or negative integer, 0 or empty. When empty, it will use UNBOUNDEDto
: The ending point of the frame. Can be a positive or negative integer, 0 or empty. When empty, it will use UNBOUNDEDto
or from
it will default to from, and to will be UNBOUNDED.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> df_mem = db_table(db, :df_mem);\n\njulia> @chain t(df_mem) begin\n @group_by groups\n @window_frame(3)\n @mutate(avg = mean(percent))\n #@show_query\n end;\n\njulia> @chain t(df_mem) begin\n @group_by groups\n @window_frame(-3, 3)\n @mutate(avg = mean(percent))\n #@show_query\n end;\n\njulia> @chain t(df_mem) begin\n @group_by groups\n # @window_frame(to = -3)\n @mutate(avg = mean(percent))\n #@show_query\n @collect\n end;\n\njulia> @chain t(df_mem) begin\n @group_by groups\n @window_frame()\n @mutate(avg = mean(percent))\n #@show_query\n end;\n
source
# TidierDB.@window_order
\u2014 Macro.
@window_order(sql_query, columns...)\n
Specify the order of rows for window functions within a SQL query.
Arguments
sql_query
: The SQL query to operate on.columns
: Columns to order the rows by for the window function. Can include multiple columns for nested sorting. Prepend a column name with - for descending order.Examples
julia> df = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9], \n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10], \n value = repeat(1:5, 2), \n percent = 0.1:0.1:1.0);\n\njulia> db = connect(duckdb());\n\njulia> copy_to(db, df, \"df_mem\");\n\njulia> @chain db_table(db, :df_mem) begin\n @group_by groups\n @window_frame(3)\n @window_order(desc(percent))\n @mutate(avg = mean(value))\n #@show_query \n end;\n
source
"},{"location":"reference/#reference-internal-functions","title":"Reference - Internal functions","text":"# TidierDB.@union_all
\u2014 Macro.
@union_all(sql_query1, sql_query2)\n
Combine two SQL queries using the UNION ALL
operator.
Arguments
sql_query1
: The first SQL query to combine.sql_query2
: The second SQL query to combine.Returns
Examples
julia> db = connect(duckdb());\n\njulia> df1 = DataFrame(id = [1, 2, 3], value = [10, 20, 30]);\n\njulia> copy_to(db, df1, \"df1\");\n\njulia> df1_table = db_table(db, \"df1\");\n\njulia> @chain t(df1_table) @union_all(df1_table) @collect\n6\u00d72 DataFrame\n Row \u2502 id value \n \u2502 Int64 Int64 \n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 10\n 2 \u2502 2 20\n 3 \u2502 3 30\n 4 \u2502 1 10\n 5 \u2502 2 20\n 6 \u2502 3 30\n
source
"},{"location":"examples/generated/UserGuide/Snowflake/","title":"Using Snowflake","text":"Establishing a connection with the Snowflake SQL Rest API requires an OAuth token specific to the Role the user will use to query tables with.
"},{"location":"examples/generated/UserGuide/Snowflake/#connecting","title":"Connecting","text":"Connection is established with the connect
function as shown below. Connection requires 5 items as strings
Two things to note:
Since each time db_table
runs, it runs a query to pull the metadata, you may choose to run db_table
and save the results, and use these results withfrom_query()
@show_query
even if the OAuthtoken has expired. To @collect
you will have to reconnect and rerun dbtable if your OAuth token has expiredset_sql_mode(snowflake())\nac_id = \"string_id\"\ntoken = \"OAuth_token_string\"\ncon = connect(:snowflake, ac_id, token, \"DEMODB\", \"PUBLIC\", \"COMPUTE_WH\")\n# After connection is established, a you may begin querying.\nstable_table_metadata = db_table(con, \"MTCARS\")\n@chain from_query(stable_table_metadata) begin\n @select(WT)\n @mutate(TEST = WT *2)\n #@aside @show_query _\n @collect\nend\n
32\u00d72 DataFrame\n Row \u2502 WT TEST\n \u2502 Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 2.62 5.24\n 2 \u2502 2.875 5.75\n 3 \u2502 2.32 4.64\n 4 \u2502 3.215 6.43\n \u22ee \u2502 \u22ee \u22ee\n 29 \u2502 3.17 6.34\n 30 \u2502 2.77 5.54\n 31 \u2502 3.57 7.14\n 32 \u2502 2.78 5.56\n 24 rows omitted\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/athena/","title":"Using Athena","text":"The setup for using the Athena AWS backend with TidierDB and a small syntax difference are covered here.
"},{"location":"examples/generated/UserGuide/athena/#connecting","title":"Connecting","text":"Connection is established through AWS.jl as shown below.
using TidierDB, AWS\nset_sql_mode(athena())\n# Replace your credentials as needed below\naws_access_key_id = get(ENV,\"AWS_ACCESS_KEY_ID\",\"key\")\naws_secret_access_key = get(ENV, \"AWS_SECRET_ACCESS_KEY\",\"secret_key\")\naws_region = get(ENV,\"AWS_DEFAULT_REGION\",\"region\")\n\nconst AWS_GLOBAL_CONFIG = Ref{AWS.AWSConfig}()\ncreds = AWSCredentials(aws_access_key_id, aws_secret_access_key)\n\nAWS_GLOBAL_CONFIG[] = AWS.global_aws_config(region=aws_region, creds=creds)\n\ncatalog = \"AwsDataCatalog\"\nworkgroup = \"primary\"\ndb = \"demodb\"\nall_results = true\nresults_per_increment = 10\nout_loc = \"s3://location/\"\n\nathena_params = Dict(\n \"ResultConfiguration\" => Dict(\n \"OutputLocation\" => out_loc\n ),\n \"QueryExecutionContext\" => Dict(\n \"Database\" => db,\n \"Catalog\" => catalog\n ),\n \"Workgroup\" => workgroup\n)\n
"},{"location":"examples/generated/UserGuide/athena/#db_table-differences","title":"db_table
differences","text":"There are two differences for db_table
which are seen in the query below
\"demodb.table_name
db_table
requires a third argument: the athena_params from above.from_query
with Athena to reduce number of queries","text":"Throughout TidierDB, each time db_table
is called, it queries the databases to get the metadata. Considering how AWS Athena logs queries, a user may want to reduce the number of queries. This can be done by saving the results of db_table
, and then using from_query with those results for further queries as shown below.
mtcars = db_table(AWS_GLOBAL_CONFIG[], \"demodb.mtcars\", athena_params)\n@chain from_query(mtcars) begin\n @filter(cyl > 4)\n @group_by(cyl)\n @summarize(mpg = mean(mpg))\n #@show_query\n @collect\nend\n
2\u00d72 DataFrame\n Row \u2502 cyl mpg\n \u2502 Int64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 6 19.7429\n 2 \u2502 8 15.1\n
I would like to acknowledge the work of Manu Francis and this blog post, which helped guide this process
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/databricks/","title":"Using Databricks","text":"Establishing a connection with the Databricks SQL Rest API requires a token.
"},{"location":"examples/generated/UserGuide/databricks/#connecting","title":"Connecting","text":"Connection is established with the connect
function as shown below. Connection requires 5 items as strings
One thing to note, Since each time db_table
runs, it runs a query to pull the metadata, you may choose to run db_table
and save the results, and use these results with from_query()
. This will reduce the number of queries to your database and is illustrated below.
set_sql_mode(databricks())\ninstance_id = \"string_id\"\ntoken = \"string_token\"\nwarehouse_id = \"e673cd4f387f964a\"\ncon = connect(:databricks, instance_id, token, \"DEMODB\", \"PUBLIC\", warehouse_id)\n# After connection is established, you may begin querying.\nstable_table_metadata = db_table(con, \"mtcars\")\n@chain from_query(stable_table_metadata) begin\n @select(wt)\n @mutate(test = wt *2)\n #@aside @show_query _\n @collect\nend\n
32\u00d72 DataFrame\n Row \u2502 wt test\n \u2502 Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 2.62 5.24\n 2 \u2502 2.875 5.75\n 3 \u2502 2.32 4.64\n 4 \u2502 3.215 6.43\n \u22ee \u2502 \u22ee \u22ee\n 29 \u2502 3.17 6.34\n 30 \u2502 2.77 5.54\n 31 \u2502 3.57 7.14\n 32 \u2502 2.78 5.56\n 24 rows omitted\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/duckplyr_reprex/","title":"Reproduce a duckplyr example","text":"In this example, we will reproduce a DuckDB and duckplyr blog post example to demonstrate TidierDB's v0.5.0 capability.
The example by Hannes that is being reproduced is exploring Open Data from the New Zealand government that is ~ 1GB.
"},{"location":"examples/generated/UserGuide/duckplyr_reprex/#set-up","title":"Set up","text":"First we will set up the local duckdb database and pull in the metadata for the files. Notice we are not reading this data into memory, only the paths, column names, and table names. To follow along, copy the set up code below after downloading the data, but add the directory to the local data.
import TidierDB as DB\ndb = DB.connect(DB.duckdb())\n\ndir = \"/Downloads/nzcensus/\"\ndata = dir * \"Data8277.csv\"\nage = dir * \"DimenLookupAge8277.csv\"\narea = dir * \"DimenLookupArea8277.csv\"\nethnic = dir * \"DimenLookupEthnic8277.csv\"\nsex = dir * \"DimenLookupSex8277.csv\"\nyear = dir * \"DimenLookupYear8277.csv\"\n\ndata = DB.db_table(db, data);\nage = DB.db_table(db, age);\narea = DB.db_table(db, area);\nethnic = DB.db_table(db, ethnic);\nsex = DB.db_table(db, sex);\nyear = DB.db_table(db, year);\n
"},{"location":"examples/generated/UserGuide/duckplyr_reprex/#exploration","title":"Exploration","text":"While this long chain could be broken up into multiple smaller chains, lets reproduce the duckplyr code from example and demonstrate how TidierDB also supports multiple joins after filtering, mutating, etc the joining tables. 6 different tables are being joined together through sequential inner joins.
@chain DB.t(data) begin\n DB.@filter(str_detect(count, r\"^\\d+$\"))\n DB.@mutate(count_ = as_integer(count))\n DB.@filter(count_ > 0)\n DB.@inner_join(\n (@chain DB.t(age) begin\n DB.@filter(str_detect(Description, r\"^\\d+ years$\"))\n DB.@mutate(age_ = as_integer(str_remove(Code, \"years\"))) end),\n Age = Code\n )\n DB.@inner_join((@chain DB.t(year) DB.@mutate(year_ = Description)), year = Code)\n DB.@inner_join((@chain DB.t(area) begin\n DB.@mutate(area_ = Description)\n DB.@filter(!str_detect(area_, r\"^Total\"))\n end)\n , Area = Code)\n DB.@inner_join((@chain DB.t(ethnic) begin\n DB.@mutate(ethnic_ = Description)\n DB.@filter(!str_detect( ethnic_, r\"^Total\",)) end), Ethnic = Code)\n DB.@inner_join((@chain DB.t(sex) begin\n DB.@mutate(sex_ = Description)\n DB.@filter(!str_detect( sex_, r\"^Total\"))\n end)\n , Sex = Code)\n DB.@inner_join((@chain DB.t(year) DB.@mutate(year_ = Description)), Year = Code)\n @aside DB.@show_query _\n DB.@create_view(joined_up)\nend;\n\n@chain DB.db_table(db, \"joined_up\") begin\n DB.@filter begin\n age_ >= 20\n age_ <= 40\n str_detect(area_, r\"^Auckland\")\n year_ == \"2018\"\n ethnic_ != \"European\"\n end\n DB.@group_by sex_\n DB.@summarise(group_count = sum(count_))\n DB.@collect\nend\n
"},{"location":"examples/generated/UserGuide/duckplyr_reprex/#results","title":"Results","text":"When we collect this to a local dataframe, we can see that the results match the duckplyr/DuckDB example.
2\u00d72 DataFrame\n Row \u2502 sex_ group_count\n \u2502 String Int128\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Female 398556\n 2 \u2502 Male 397326\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/ex_joining/","title":"Joining Tables","text":"This page will illustrate how to join different tables in TidierDB. The examples will use the mtcars
dataset and a synthetic dataset called mt2
hosted on a personal MotherDuck instance. Examples will cover how to join tables with different schemas in different databases, and how to write queries on tables and then join them together, and how to do this by leveraging views.
using TidierDB\ndb = connect(duckdb(), \"md:\")\n\nmtcars = db_table(db, \"my_db.mtcars\")\nmt2 = db_table(db, \"ducks_db.mt2\")\n
"},{"location":"examples/generated/UserGuide/ex_joining/#wrangle-tables-and-self-join","title":"Wrangle tables and self join","text":"query = @chain t(mtcars) begin\n @group_by cyl\n @summarize begin\n across(mpg, (mean, minimum, maximum))\n num_cars = n()\n end\n @mutate begin\n efficiency = case_when(\n mpg_mean >= 25, \"High\",\n mpg_mean >= 15, \"Moderate\",\n \"Low\" )\n end\nend;\n\nquery2 = @chain t(mtcars) @filter(mpg>20) @mutate(mpg = mpg *4);\n\n@chain t(query) begin\n @left_join(t(query2), cyl, cyl)\n @group_by(efficiency)\n @summarize(avg_mean = mean(mpg))\n @mutate(mean = avg_mean / 4 )\n @aside @show_query _\n @collect\nend\n
2\u00d73 DataFrame\n Row \u2502 efficiency avg_mean mean\n \u2502 String Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 High 106.655 26.6636\n 2 \u2502 Moderate 84.5333 21.1333\n
"},{"location":"examples/generated/UserGuide/ex_joining/#different-schemas","title":"Different schemas","text":"To connect to a table in a different schema, prefix it with a dot. For example, \"schemaname.tablename\". In this query, we are also filtering out cars that contain \"M\" in the name from the mt2
table before joining.
other_db = @chain db_table(db, \"ducks_db.mt2\") @filter(!str_detect(car, \"M\"))\n@chain t(mtcars) begin\n @left_join(t(other_db), car, model)\n @select(car, model)\n @head(5)\n @collect\nend\n
5\u00d72 DataFrame\n Row \u2502 car model\n \u2502 String String\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Datsun 710 Datsun 710\n 2 \u2502 Hornet 4 Drive Hornet 4 Drive\n 3 \u2502 Hornet Sportabout Hornet Sportabout\n 4 \u2502 Valiant Valiant\n 5 \u2502 Duster 360 Duster 360\n
To join directly to the table, you can use the @left_join
macro with the table name as a string.
@chain t(mtcars) begin\n @left_join(\"ducks_db.mt2\", car, model)\n @select(car, model)\n @head(5)\n @collect\nend\n
5\u00d72 DataFrame\n Row \u2502 car model\n \u2502 String String\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 Mazda RX4\n 2 \u2502 Mazda RX4 Wag Mazda RX4 Wag\n 3 \u2502 Datsun 710 Datsun 710\n 4 \u2502 Hornet 4 Drive Hornet 4 Drive\n 5 \u2502 Hornet Sportabout Hornet Sportabout\n
"},{"location":"examples/generated/UserGuide/ex_joining/#using-a-view","title":"Using a View","text":"You can also use @create_view
to create views and then join them. This is an alternative way to reuse complex queries.
# notice, this is not begin saved, bc a view is created in the database at the end of the chain\n@chain t(mtcars) begin\n @group_by cyl\n @summarize begin\n across(mpg, (mean, minimum, maximum))\n num_cars = n()\n end\n @mutate begin\n efficiency = case_when(\n mpg_mean >= 25, \"High\",\n mpg_mean >= 15, \"Moderate\",\n \"Low\" )\n end\n #create a view in the database\n @create_view(viewer)\nend;\n\n# access the view like as if it was any other table\n@chain db_table(db, \"viewer\") begin\n @left_join(t(query2), cyl, cyl)\n @group_by(efficiency)\n @summarize(avg_mean = mean(mpg))\n @mutate(mean = avg_mean / 4 )\n @collect\nend\n
2\u00d73 DataFrame\n Row \u2502 efficiency avg_mean mean\n \u2502 String Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 High 106.655 26.6636\n 2 \u2502 Moderate 84.5333 21.1333\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/from_queryex/","title":"Reusing a Query (and Views)","text":"While using TidierDB, you may need to generate part of a query and reuse it multiple times. There are two ways to do this
from_query(query)
or its alias t(query)
@create_view(name)
import TidierDB as DB\ncon = DB.connect(duckdb())\nmtcars_path = \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\"\nmtcars = DB.db_table(con, mtcars_path)\n
Start a query to analyze fuel efficiency by number of cylinders. However, to further build on this query later, end the chain without using @show_query
or @collect
query = DB.@chain DB.t(mtcars) begin\n DB.@group_by cyl\n DB.@summarize begin\n across(mpg, (mean, minimum, maximum))\n num_cars = n()\n end\n DB.@mutate begin\n efficiency = case_when(\n mpg_mean >= 25, \"High\",\n mpg_mean >= 15, \"Moderate\",\n \"Low\" )\n end\nend;\n
"},{"location":"examples/generated/UserGuide/from_queryex/#from_query-or-tquery","title":"from_query()
or t(query)
","text":"Now, from_query
, or t()
a convenience wrapper, will allow you to reuse the query to calculate the average horsepower for each efficiency category
DB.@chain DB.t(query) begin\n DB.@left_join(DB.t(mtcars), cyl = cyl)\n DB.@group_by(efficiency)\n DB.@summarize(avg_hp = mean(hp))\n DB.@collect\nend\n
2\u00d72 DataFrame\n Row \u2502 efficiency avg_hp\n \u2502 String? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Moderate 180.238\n 2 \u2502 High 82.6364\n
"},{"location":"examples/generated/UserGuide/from_queryex/#create_view","title":"@create_view","text":"This can also be done with @create_view
.
query2 = @chain t(mtcars) @filter(mpg>20) @mutate(mpg = mpg *4);\nDB.@chain DB.db_table(db, \"mtcars\") begin\n DB.@group_by cyl\n DB.@summarize begin\n across(mpg, (mean, minimum, maximum))\n num_cars = n()\n end\n DB.@mutate begin\n efficiency = case_when(\n mpg_mean >= 25, \"High\",\n mpg_mean >= 15, \"Moderate\",\n \"Low\" )\n end\n DB.@create_view(viewer)\n end;\n\n\nDB.@chain DB.db_table(db, \"viewer\") begin\n DB.@left_join(DB.t(query2), cyl = cyl)\n DB.@group_by(efficiency)\n DB.@summarize(avg_mean = mean(mpg))\n DB.@mutate(mean = avg_mean / 4 )\n @aside DB.@show_query _\n DB.@collect\nend\n2\u00d73 DataFrame\n Row \u2502 efficiency avg_mean mean\n \u2502 String Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 High 106.655 26.6636\n 2 \u2502 Moderate 84.5333 21.1333\n
"},{"location":"examples/generated/UserGuide/from_queryex/#preview-or-save-an-intermediate-table","title":"Preview or save an intermediate table","text":"While querying a dataset, you may wish to see an intermediate table, or even save it. You can use @aside
and from_query(_)
, illustrated below, to do just that. While we opted to print the results in this simple example below, we could have saved them by using name = DB.@chain...
import ClickHouse;\nconn = conn = DB.connect(DB.clickhouse(); host=\"localhost\", port=19000, database=\"default\", user=\"default\", password=\"\")\npath = \"https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet\"\nDB.@chain DB.db_table(conn, path) begin\n DB.@count(cyl)\n @aside println(DB.@chain DB.from_query(_) DB.@head(5) DB.@collect)\n DB.@arrange(desc(count))\n DB.@collect\nend\n
5\u00d72 DataFrame\n Row \u2502 artists count\n \u2502 String? UInt64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 missing 1\n 2 \u2502 Wizo 3\n 3 \u2502 MAGIC! 3\n 4 \u2502 Macaco 1\n 5 \u2502 SOYOU 1\n31438\u00d72 DataFrame\n Row \u2502 artists count\n \u2502 String? UInt64\n\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 The Beatles 279\n 2 \u2502 George Jones 271\n 3 \u2502 Stevie Wonder 236\n 4 \u2502 Linkin Park 224\n 5 \u2502 Ella Fitzgerald 222\n 6 \u2502 Prateek Kuhad 217\n 7 \u2502 Feid 202\n \u22ee \u2502 \u22ee \u22ee\n 31432 \u2502 Leonard 1\n 31433 \u2502 marcos g 1\n 31434 \u2502 BLVKSHP 1\n 31435 \u2502 Memtrix 1\n 31436 \u2502 SOYOU 1\n 31437 \u2502 Macaco 1\n 31438 \u2502 missing 1\n 31424 rows omitted\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/functions_pass_to_DB/","title":"Writing Functions with TidierDB Chains","text":"On this page, we'll briefly explore how to use TidierDB macros and $
with @eval
to build a function
For a more in-depth explanation, please check out the TidierData page on interpolation
using TidierDB, DataFrames;\n\ndb = connect(duckdb());\ndf = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9],\n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10],\n value = repeat(1:5, 2),\n percent = 0.1:0.1:1.0);\ncopy_to(db, df, \"dfm\");\ndf_mem = db_table(db, \"dfm\");\n
"},{"location":"examples/generated/UserGuide/functions_pass_to_DB/#interpolation","title":"Interpolation","text":"Variables are interpoated using @eval
and $
. Place @eval
before you begin the chain or call a TidierDB macro. Why use @eval? In Julia, macros like @filter are expanded at parse time, before runtime variables like vals are available. By using @eval, we force the expression to be evaluated at runtime, allowing us to interpolate the variable into the macro.
num = [3];\ncolumn = :id;\n@eval @chain t(df_mem) begin\n @filter(value in $num)\n @select($column)\n @collect\n end\n
2\u00d71 DataFrame RowidString1AC2AH "},{"location":"examples/generated/UserGuide/functions_pass_to_DB/#function-set-up","title":"Function set up","text":"Begin by defining your function as you normally would, but before @chain
you need to use @eval
. Variables to be interpolated need to be prefixed with $
function test(vals, cols)\n @eval @chain t(df_mem) begin\n @filter(value in $vals)\n @select($cols)\n @collect\n end\nend;\n\nvals = [1, 2, 3, 3];\ntest(vals, [:groups, :value, :percent])\n
6\u00d73 DataFrame RowgroupsvaluepercentStringInt64Float641bb10.12aa10.63aa20.24bb20.75bb30.36aa30.8 Now with a new variable
other_vals = [1];\ncols = [:value, :percent];\ntest(other_vals, cols)\n
2\u00d72 DataFrame RowvaluepercentInt64Float64110.1210.6 Defining a new function
function gs(groups, aggs, new_name, threshold)\n @eval @chain t(df_mem) begin\n @group_by($groups)\n @summarize($new_name = mean($aggs))\n @filter($new_name > $threshold)\n @collect\n end\nend;\n\ngs(:groups, :percent, :mean_percent, .5)\n
1\u00d72 DataFrame Rowgroupsmean_percentStringFloat641aa0.6 Change the column and threshold
gs(:groups, :value, :mean_value, 2)\n
2\u00d72 DataFrame Rowgroupsmean_valueStringFloat641bb3.02aa3.0 "},{"location":"examples/generated/UserGuide/functions_pass_to_DB/#write-pipeline-function-to-use-inside-of-chains","title":"Write pipeline function to use inside of chains","text":"Let's say there is a particular sequence of macros that you want to repeatedly use. Wrap this series into a function that accepts a t(query)
as its first argument and returns a SQLquery
and you can easily reuse it.
function moving_aggs(table, start, stop, group, order, col)\n qry = @eval @chain $table begin\n @group_by $group\n @window_frame $start $stop\n @window_order $order\n @mutate(across($col, (minimum, maximum, mean)))\n end\n return qry\nend;\n\n@chain t(df_mem) begin\n moving_aggs(-2, 1, :groups, :percent, :value)\n @filter value_mean > 2.75\n @aside @show_query _\n @collect\nend\n
6\u00d77 DataFrame Rowidgroupsvaluepercentvalue_minimumvalue_maximumvalue_meanStringStringInt64Float64Int64Int64Float641ABaa20.2243.02AHaa30.8153.253AJaa51.0153.04ACbb30.3153.05AGbb20.7253.56AIbb40.9253.66667 Filtering before the window functions
@chain t(df_mem) begin\n @filter(value >=2 )\n moving_aggs(-1, 1, :groups, :percent, :value)\n @aside @show_query _\n @collect\nend\n
8\u00d77 DataFrame Rowidgroupsvaluepercentvalue_minimumvalue_maximumvalue_meanStringStringInt64Float64Int64Int64Float641ABaa20.2243.02ADaa40.4243.03AHaa30.8354.04AJaa51.0354.05ACbb30.3354.06AEbb50.5253.333337AGbb20.7253.666678AIbb40.9243.0 "},{"location":"examples/generated/UserGuide/functions_pass_to_DB/#interpolating-queries","title":"Interpolating Queries","text":"To use a prior, uncollected TidierDB query in other TidierDB macros, interpolate the needed query without showing or collecting it
ok = @chain t(df_mem) @summarize(mean = mean(value));\n
The mean value represented in SQL from the above is 3
With @filter
@eval @chain t(df_mem) begin\n @filter( value > $ok)\n @collect\nend\n
4\u00d74 DataFrame RowidgroupsvaluepercentStringStringInt64Float641ADaa40.42AEbb50.53AIbb40.94AJaa51.0 With @mutate
@eval @chain t(df_mem) begin\n @mutate(value2 = value + $ok)\n @collect\nend\n
10\u00d75 DataFrame Rowidgroupsvaluepercentvalue2StringStringInt64Float64Float641AAbb10.14.02ABaa20.25.03ACbb30.36.04ADaa40.47.05AEbb50.58.06AFaa10.64.07AGbb20.75.08AHaa30.86.09AIbb40.97.010AJaa51.08.0 With @summarize
@eval @chain t(df_mem) begin\n @summarize(value = mean(value) * $ok)\n @collect\nend\n
1\u00d71 DataFrame RowvalueFloat6419.0 This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/getting_started/","title":"Getting Started","text":"To use TidierDB.jl, you will have to set up a connection. TidierDB.jl gives you access to duckdb via duckdb_open
and duckdb_connect
. However, to use MySql, ClickHouse, MSSQL, Postgres, or SQLite, you will have to load those packages in first.
If you plan to use TidierDB.jl with TidierData.jl or Tidier.jl, it is most convenient to load the packages as follows:
using TidierData\nimport TidierDB as DB\n
Alternatively, using Tidier
will import TidierDB in the above manner for you, where TidierDB functions and macros will be available as DB.@mutate()
and so on, and the TidierData equivalent would be @mutate()
.
To connect to a database, you can use the connect
function as shown below, or establish your own connection through the respective libraries.
For example Connecting to MySQL
conn = DB.connect(DB.mysql(); host=\"localhost\", user=\"root\", password=\"password\", db=\"mydb\")\n
versus connecting to DuckDB
conn = DB.connect(DB.duckdb())\n
"},{"location":"examples/generated/UserGuide/getting_started/#connect-to-a-local-database-file","title":"Connect to a local database file","text":"You can also connect to an existing database by passing the database file path as a string.
db = DB.connect(DB.duckdb(), \"mydb.duckdb\")\n
You can also establish any DuckDB connection through an alternate method that you prefer, and use that as your connection as well.
"},{"location":"examples/generated/UserGuide/getting_started/#package-extensions","title":"Package Extensions","text":"The following backends utilize package extensions. To use one of the backends listed below, you will need to write using Library
import ClickHouse
using MySQL
using ODBC
using LibPQ
using SQLite
using AWS
using ODBC
using GoogleCloud
db_table
","text":"What does db_table
do?
db_table
starts the underlying SQL query struct, in addition to pulling the table metadata and storing it there. Storing metadata is what enables a lazy interface that also supports tidy selection.
db_table
has two required arguments: connection
and table
table
can be a table name on a database or a path/url to file to read. When passing db_table
a path or url, the table is not copied into memory.
db_table
only supports direct file paths to a table. It does not support database file paths such as dbname.duckdb
or dbname.sqlite
. Such files must be used with connect
first.*
read in all files matching the pattern..csv
in the given folder.db_table(db, \"folder/path/*.csv\")\n
db_table
also supports iceberg, delta, and S3 file paths via DuckDB.
If you are working with a backend where compute cost is important, it will be important to minimize using db_table
as this will requery for metadata each time. Compute costs are relevant to backends such as AWS, databricks and Snowflake.
To do this, save the results of db_table
and use them with t
. Using t
pulls the relevant information (metadata, con, etc) from the mutable SQLquery struct, allowing you to repeatedly query and collect the table without requerying for the metadata each time
!Tip: t()
is an alias for from_query
This means after saving the results of db_table
, use t(table)
to refer to the table or prior query
table = DB.db_table(con, \"path\")\n@chain DB.t(table) begin\n ## data wrangling here\nend\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/ibis_comp/","title":"TidierDB.jl vs Ibis","text":""},{"location":"examples/generated/UserGuide/ibis_comp/#comparing-tidierdb-vs-ibis","title":"Comparing TidierDB vs Ibis","text":"TidierDB is a reimplementation of dbplyr from R, so the syntax is remarkably similar. But how does TidierDB compare to Python's Ibis? This page will perform a similar comparison to the Ibis Documentation comparing Ibis and dplyr
"},{"location":"examples/generated/UserGuide/ibis_comp/#set-up","title":"Set up","text":"Ibis
import ibis\nimport ibis.selectors as s # allows for different styles of column selection\nfrom ibis import _ # eliminates need to type table name before each column vs typing cols as strings\nibis.options.interactive = True # automatically collects first 10 rows of table\n\ncon = ibis.connect(\"duckdb://\")\n
TidierDB
using TidierDB\ndb = connect(duckdb())\n
Of note, TidierDB does not yet have an \"interactive mode\" so each example result will be collected.
"},{"location":"examples/generated/UserGuide/ibis_comp/#loading-data","title":"Loading Data","text":"With Ibis, there are specific functions to read in different file types
mtcars = con.read_csv(\"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\")\n
In TidierDB, there is only db_table
, which determines the file type and generates the syntax appropriate for the backend in use.
mtcars = db_table(db, \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\");\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#previewing-the-data","title":"Previewing the data","text":"TidierDB and Ibis use head
/@head
to preview the first rows of a dataset.
Ibis
mtcars.head(6)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 model \u2503 mpg \u2503 cyl \u2503 disp \u2503 hp \u2503 drat \u2503 wt \u2503 qsec \u2503 vs \u2503 am \u2503 gear \u2503 carb \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502 float64 \u2502 int64 \u2502 float64 \u2502 int64 \u2502 float64 \u2502 float64 \u2502 float64 \u2502 int64 \u2502 int64 \u2502 int64 \u2502 int64 
\u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Mazda RX4 \u2502 21.0 \u2502 6 \u2502 160.0 \u2502 110 \u2502 3.90 \u2502 2.620 \u2502 16.46 \u2502 0 \u2502 1 \u2502 4 \u2502 4 \u2502\n\u2502 Mazda RX4 Wag \u2502 21.0 \u2502 6 \u2502 160.0 \u2502 110 \u2502 3.90 \u2502 2.875 \u2502 17.02 \u2502 0 \u2502 1 \u2502 4 \u2502 4 \u2502\n\u2502 Datsun 710 \u2502 22.8 \u2502 4 \u2502 108.0 \u2502 93 \u2502 3.85 \u2502 2.320 \u2502 18.61 \u2502 1 \u2502 1 \u2502 4 \u2502 1 \u2502\n\u2502 Hornet 4 Drive \u2502 21.4 \u2502 6 \u2502 258.0 \u2502 110 \u2502 3.08 \u2502 3.215 \u2502 19.44 \u2502 1 \u2502 0 \u2502 3 \u2502 1 \u2502\n\u2502 Hornet Sportabout \u2502 18.7 \u2502 8 \u2502 360.0 \u2502 175 \u2502 3.15 \u2502 3.440 \u2502 17.02 \u2502 0 \u2502 0 \u2502 3 \u2502 2 \u2502\n\u2502 Valiant \u2502 18.1 \u2502 6 \u2502 225.0 \u2502 105 \u2502 2.76 \u2502 3.460 \u2502 20.22 \u2502 1 \u2502 0 \u2502 3 \u2502 1 
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
TidierDB
@chain t(mtcars) @head(6) @collect\n
6\u00d712 DataFrame\n Row \u2502 model mpg cyl disp hp drat wt qsec vs am gear carb\n \u2502 String? Float64? Int64? Float64? Int64? Float64? Float64? Float64? Int64? Int64? Int64? Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4\n 2 \u2502 Mazda RX4 Wag 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4\n 3 \u2502 Datsun 710 22.8 4 108.0 93 3.85 2.32 18.61 1 1 4 1\n 4 \u2502 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1\n 5 \u2502 Hornet Sportabout 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2\n 6 \u2502 Valiant 18.1 6 225.0 105 2.76 3.46 20.22 1 0 3 1\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#filtering","title":"Filtering","text":"The example below demonstrates how to filter using multiple criteria in both Ibis and TidierData Ibis
mtcars.filter(((_.mpg > 22) & (_.drat > 4) | (_.hp == 113)))\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 model \u2503 mpg \u2503 cyl \u2503 disp \u2503 hp \u2503 drat \u2503 wt \u2503 qsec \u2503 vs \u2503 am \u2503 gear \u2503 carb \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502 float64 \u2502 int64 \u2502 float64 \u2502 int64 \u2502 float64 \u2502 float64 \u2502 float64 \u2502 int64 \u2502 int64 \u2502 int64 \u2502 int64 
\u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Lotus Europa \u2502 30.4 \u2502 4 \u2502 95.1 \u2502 113 \u2502 3.77 \u2502 1.513 \u2502 16.90 \u2502 1 \u2502 1 \u2502 5 \u2502 2 \u2502\n\u2502 Fiat 128 \u2502 32.4 \u2502 4 \u2502 78.7 \u2502 66 \u2502 4.08 \u2502 2.200 \u2502 19.47 \u2502 1 \u2502 1 \u2502 4 \u2502 1 \u2502\n\u2502 Honda Civic \u2502 30.4 \u2502 4 \u2502 75.7 \u2502 52 \u2502 4.93 \u2502 1.615 \u2502 18.52 \u2502 1 \u2502 1 \u2502 4 \u2502 2 \u2502\n\u2502 Toyota Corolla \u2502 33.9 \u2502 4 \u2502 71.1 \u2502 65 \u2502 4.22 \u2502 1.835 \u2502 19.90 \u2502 1 \u2502 1 \u2502 4 \u2502 1 \u2502\n\u2502 Fiat X1-9 \u2502 27.3 \u2502 4 \u2502 79.0 \u2502 66 \u2502 4.08 \u2502 1.935 \u2502 18.90 \u2502 1 \u2502 1 \u2502 4 \u2502 1 \u2502\n\u2502 Porsche 914-2 \u2502 26.0 \u2502 4 \u2502 120.3 \u2502 91 \u2502 4.43 \u2502 2.140 \u2502 16.70 \u2502 0 \u2502 1 \u2502 5 \u2502 2 
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
TidierDB
@chain t(mtcars) begin\n @filter((mpg > 22 && drat > 4) || hp == 113)\n @collect\nend\n
6\u00d712 DataFrame\n Row \u2502 model mpg cyl disp hp drat wt qsec vs am gear carb\n \u2502 String? Float64? Int64? Float64? Int64? Float64? Float64? Float64? Int64? Int64? Int64? Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2\n 2 \u2502 Fiat 128 32.4 4 78.7 66 4.08 2.2 19.47 1 1 4 1\n 3 \u2502 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n 4 \u2502 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1\n 5 \u2502 Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.9 1 1 4 1\n 6 \u2502 Porsche 914-2 26.0 4 120.3 91 4.43 2.14 16.7 0 1 5 2\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#creating-new-columns","title":"Creating new columns","text":"Both TidierDB and Ibis use mutate
/@mutate
to add new columns
Ibis
(\n mtcars\n .mutate(kpg = _.mpg * 1.61)\n .select(\"model\", \"kpg\")\n)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 model \u2503 kpg \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502 float64 \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Mazda RX4 \u2502 33.810 \u2502\n\u2502 Mazda RX4 Wag \u2502 33.810 \u2502\n\u2502 Datsun 710 \u2502 36.708 \u2502\n\u2502 Hornet 4 Drive \u2502 34.454 \u2502\n\u2502 Hornet Sportabout \u2502 30.107 \u2502\n\u2502 Valiant \u2502 29.141 \u2502\n\u2502 Duster 360 \u2502 23.023 \u2502\n\u2502 Merc 240D \u2502 39.284 \u2502\n\u2502 Merc 230 \u2502 36.708 \u2502\n\u2502 Merc 280 \u2502 30.912 \u2502\n\u2502 \u2026 \u2502 \u2026 \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
TidierDB
@chain t(mtcars) begin\n @mutate(kpg = mpg * 1.61)\n @select(model, kpg)\n @collect\nend\n
32\u00d72 DataFrame\n Row \u2502 model kpg\n \u2502 String? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 33.81\n 2 \u2502 Mazda RX4 Wag 33.81\n 3 \u2502 Datsun 710 36.708\n 4 \u2502 Hornet 4 Drive 34.454\n 5 \u2502 Hornet Sportabout 30.107\n 6 \u2502 Valiant 29.141\n \u22ee \u2502 \u22ee \u22ee\n 27 \u2502 Porsche 914-2 41.86\n 28 \u2502 Lotus Europa 48.944\n 29 \u2502 Ford Pantera L 25.438\n 30 \u2502 Ferrari Dino 31.717\n 31 \u2502 Maserati Bora 24.15\n 32 \u2502 Volvo 142E 34.454\n 20 rows omitted\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#sorting-columns","title":"Sorting columns","text":"Ibis uses order_by
similar to SQL's ORDER BY
Ibis
mtcars.order_by(_.mpg)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 model \u2503 mpg \u2503 cyl \u2503 disp \u2503 hp \u2503 drat \u2503 wt \u2503 qsec \u2503 vs \u2503 am \u2503 gear \u2503 carb \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502 float64 \u2502 int64 \u2502 float64 \u2502 int64 \u2502 float64 \u2502 float64 \u2502 float64 \u2502 int64 \u2502 int64 \u2502 int64 \u2502 int64 
\u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Cadillac Fleetwood \u2502 10.4 \u2502 8 \u2502 472.0 \u2502 205 \u2502 2.93 \u2502 5.250 \u2502 17.98 \u2502 0 \u2502 0 \u2502 3 \u2502 4 \u2502\n\u2502 Lincoln Continental \u2502 10.4 \u2502 8 \u2502 460.0 \u2502 215 \u2502 3.00 \u2502 5.424 \u2502 17.82 \u2502 0 \u2502 0 \u2502 3 \u2502 4 \u2502\n\u2502 Camaro Z28 \u2502 13.3 \u2502 8 \u2502 350.0 \u2502 245 \u2502 3.73 \u2502 3.840 \u2502 15.41 \u2502 0 \u2502 0 \u2502 3 \u2502 4 \u2502\n\u2502 Duster 360 \u2502 14.3 \u2502 8 \u2502 360.0 \u2502 245 \u2502 3.21 \u2502 3.570 \u2502 15.84 \u2502 0 \u2502 0 \u2502 3 \u2502 4 \u2502\n\u2502 Chrysler Imperial \u2502 14.7 \u2502 8 \u2502 440.0 \u2502 230 \u2502 3.23 \u2502 5.345 \u2502 17.42 \u2502 0 \u2502 0 \u2502 3 \u2502 4 \u2502\n\u2502 Maserati Bora \u2502 15.0 \u2502 8 \u2502 301.0 \u2502 335 \u2502 3.54 \u2502 3.570 \u2502 14.60 \u2502 0 \u2502 1 \u2502 5 \u2502 8 \u2502\n\u2502 Merc 450SLC \u2502 15.2 \u2502 8 \u2502 275.8 \u2502 180 \u2502 3.07 \u2502 3.780 \u2502 18.00 \u2502 0 \u2502 0 \u2502 3 \u2502 3 \u2502\n\u2502 AMC Javelin \u2502 15.2 \u2502 8 \u2502 304.0 \u2502 150 \u2502 3.15 \u2502 3.435 \u2502 17.30 \u2502 0 \u2502 0 \u2502 3 \u2502 2 \u2502\n\u2502 Dodge Challenger \u2502 15.5 \u2502 8 \u2502 318.0 \u2502 150 
\u2502 2.76 \u2502 3.520 \u2502 16.87 \u2502 0 \u2502 0 \u2502 3 \u2502 2 \u2502\n\u2502 Ford Pantera L \u2502 15.8 \u2502 8 \u2502 351.0 \u2502 264 \u2502 4.22 \u2502 3.170 \u2502 14.50 \u2502 0 \u2502 1 \u2502 5 \u2502 4 \u2502\n\u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
While TidierDB uses @arrange
like TidierData.jl
TidierDB
@chain t(mtcars) @arrange(mpg) @collect\n
32\u00d712 DataFrame\n Row \u2502 model mpg cyl disp hp drat wt qsec vs am gear carb\n \u2502 String? Float64? Int64? Float64? Int64? Float64? Float64? Float64? Int64? Int64? Int64? Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.25 17.98 0 0 3 4\n 2 \u2502 Lincoln Continental 10.4 8 460.0 215 3.0 5.424 17.82 0 0 3 4\n 3 \u2502 Camaro Z28 13.3 8 350.0 245 3.73 3.84 15.41 0 0 3 4\n 4 \u2502 Duster 360 14.3 8 360.0 245 3.21 3.57 15.84 0 0 3 4\n 5 \u2502 Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4\n 6 \u2502 Maserati Bora 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 8\n \u22ee \u2502 \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee \u22ee\n 27 \u2502 Porsche 914-2 26.0 4 120.3 91 4.43 2.14 16.7 0 1 5 2\n 28 \u2502 Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.9 1 1 4 1\n 29 \u2502 Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2\n 30 \u2502 Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.9 1 1 5 2\n 31 \u2502 Fiat 128 32.4 4 78.7 66 4.08 2.2 19.47 1 1 4 1\n 32 \u2502 Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.9 1 1 4 1\n 20 rows omitted\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#selecting-columns","title":"Selecting columns","text":"In Ibis, columns must be prefixed with the table name, or in this case _
, or they can be given as a string. Finally, using helper functions like startswith
requires importing selectors as above.
Ibis
mtcars.select(s.startswith(\"m\"), \"drat\", _.wt)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 model \u2503 mpg \u2503 drat \u2503 wt \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502 float64 \u2502 float64 \u2502 float64 \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Mazda RX4 \u2502 21.0 \u2502 3.90 \u2502 2.620 \u2502\n\u2502 Mazda RX4 Wag \u2502 21.0 \u2502 3.90 \u2502 2.875 \u2502\n\u2502 Datsun 710 \u2502 22.8 \u2502 3.85 \u2502 2.320 \u2502\n\u2502 Hornet 4 Drive \u2502 21.4 \u2502 3.08 \u2502 3.215 \u2502\n\u2502 Hornet Sportabout \u2502 18.7 \u2502 3.15 \u2502 3.440 \u2502\n\u2502 Valiant \u2502 18.1 \u2502 2.76 \u2502 3.460 \u2502\n\u2502 Duster 360 \u2502 14.3 \u2502 3.21 \u2502 3.570 \u2502\n\u2502 Merc 240D \u2502 24.4 \u2502 3.69 \u2502 3.190 \u2502\n\u2502 Merc 230 \u2502 22.8 \u2502 3.92 \u2502 3.150 \u2502\n\u2502 Merc 280 \u2502 19.2 \u2502 3.92 \u2502 3.440 \u2502\n\u2502 \u2026 \u2502 \u2026 \u2502 \u2026 \u2502 \u2026 
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
TidierDB does not require names to be prefixed and, like TidierData, tidy column selection with starts_with
, ends_with
, and contains
is supported at base. TidierDB also supports providing column names as strings, although this would only be needed in the setting of renaming a column with a space in it.
TidierDB
@chain t(mtcars) @select(starts_with(\"m\"), \"drat\", wt) @collect\n
32\u00d74 DataFrame\n Row \u2502 model mpg drat wt\n \u2502 String? Float64? Float64? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 21.0 3.9 2.62\n 2 \u2502 Mazda RX4 Wag 21.0 3.9 2.875\n 3 \u2502 Datsun 710 22.8 3.85 2.32\n 4 \u2502 Hornet 4 Drive 21.4 3.08 3.215\n 5 \u2502 Hornet Sportabout 18.7 3.15 3.44\n 6 \u2502 Valiant 18.1 2.76 3.46\n \u22ee \u2502 \u22ee \u22ee \u22ee \u22ee\n 27 \u2502 Porsche 914-2 26.0 4.43 2.14\n 28 \u2502 Lotus Europa 30.4 3.77 1.513\n 29 \u2502 Ford Pantera L 15.8 4.22 3.17\n 30 \u2502 Ferrari Dino 19.7 3.62 2.77\n 31 \u2502 Maserati Bora 15.0 3.54 3.57\n 32 \u2502 Volvo 142E 21.4 4.11 2.78\n 20 rows omitted\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#multi-step-queries-and-summarizing","title":"Multi step queries and summarizing","text":"Aggregating data is done with aggregate
in Ibis and @summarize
in TidierDB. To group data, both utilize group_by
/@group_by
Ibis
mtcars.group_by(_.cyl).aggregate(\n total_hp=_.hp.sum(),\n avg_hp=_.hp.mean()\n).filter(_.total_hp < 1000)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2533\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 cyl \u2503 total_hp \u2503 avg_hp \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2547\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 int64 \u2502 int64 \u2502 float64 \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 6 \u2502 856 \u2502 122.285714 \u2502\n\u2502 4 \u2502 909 \u2502 82.636364 \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
In TidierDB, @filter
will automatically determine whether the criteria belong in a WHERE
or HAVING
SQL clause.
TidierDB
@chain t(mtcars) begin\n @group_by(cyl)\n @summarize(total_hp = sum(hp),\n avg_hp = avg(hp))\n @filter(total_hp < 1000)\n @collect\nend\n
2\u00d73 DataFrame\n Row \u2502 cyl total_hp avg_hp\n \u2502 Int64? Int128? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 6 856 122.286\n 2 \u2502 4 909 82.6364\n
"},{"location":"examples/generated/UserGuide/ibis_comp/#renaming-columns","title":"Renaming columns","text":"Both tools use rename
/@rename to rename columns
Ibis
mtcars.rename(make_model = \"model\").select(_.make_model)\n
\u250f\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2513\n\u2503 make_model \u2503\n\u2521\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2529\n\u2502 string \u2502\n\u251c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2524\n\u2502 Mazda RX4 \u2502\n\u2502 Mazda RX4 Wag \u2502\n\u2502 Datsun 710 \u2502\n\u2502 Hornet 4 Drive \u2502\n\u2502 Hornet Sportabout \u2502\n\u2502 Valiant \u2502\n\u2502 Duster 360 \u2502\n\u2502 Merc 240D \u2502\n\u2502 Merc 230 \u2502\n\u2502 Merc 280 \u2502\n\u2502 \u2026 \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
TidierDB
@chain t(mtcars) @rename(model_make = model) @select(model_make) @collect\n
32\u00d71 DataFrame\n Row \u2502 model_make\n \u2502 String?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4\n 2 \u2502 Mazda RX4 Wag\n 3 \u2502 Datsun 710\n 4 \u2502 Hornet 4 Drive\n 5 \u2502 Hornet Sportabout\n 6 \u2502 Valiant\n \u22ee \u2502 \u22ee\n 27 \u2502 Porsche 914-2\n 28 \u2502 Lotus Europa\n 29 \u2502 Ford Pantera L\n 30 \u2502 Ferrari Dino\n 31 \u2502 Maserati Bora\n 32 \u2502 Volvo 142E\n 20 rows omitted\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/key_differences/","title":"Key Differences from TidierData.jl","text":"There are a few important syntax and behavior differences between TidierDB.jl and TidierData.jl outlined below.
"},{"location":"examples/generated/UserGuide/key_differences/#creating-a-database","title":"Creating a database","text":"For these examples we will use DuckDB, the default backend, although SQLite, Postgres, MySQL, MariaDB, MSSQL, and ClickHouse are possible. If you have an existing DuckDB connection, then this step is not required. For these examples, we will create a data frame and copy it to an in-memory DuckDB database.
using DataFrames, TidierDB\n\ndf = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9],\n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10],\n value = repeat(1:5, 2),\n percent = 0.1:0.1:1.0);\n\ndb = connect(duckdb());\n\ncopy_to(db, df, \"df_mem\"); # copying over the data frame to an in-memory database\n
"},{"location":"examples/generated/UserGuide/key_differences/#row-ordering","title":"Row ordering","text":"DuckDB benefits from aggressive parallelization of pipelines. This means that if you have multiple threads enabled in Julia, which you can check or set using Threads.nthreads()
, DuckDB will use multiple threads. However, because many operations are multi-threaded, the resulting row order is inconsistent. If row order needs to be deterministic for your use case, make sure to apply an @arrange(column_name_1, column_name_2, etc...)
prior to collecting the results.
When using TidierDB, db_table(connection, :table_name)
is used to start a chain.
In TidierDB, when performing @group_by
then @mutate
, the table will be ungrouped after applying all of the mutations in the clause to the grouped data. To perform subsequent grouped operations, the user would have to regroup the data. This is demonstrated below.
@chain db_table(db, :df_mem) begin\n @group_by(groups)\n @summarize(mean_percent = mean(percent))\n @collect\n end\n
2\u00d72 DataFrame Rowgroupsmean_percentStringFloat641bb0.52aa0.6 Regrouping following @mutate
@chain db_table(db, :df_mem) begin\n @group_by(groups)\n @mutate(max = maximum(percent), min = minimum(percent))\n @group_by(groups)\n @summarise(mean_percent = mean(percent))\n @collect\nend\n
2\u00d72 DataFrame Rowgroupsmean_percentStringFloat641aa0.62bb0.5 "},{"location":"examples/generated/UserGuide/key_differences/#differences-in-case_when","title":"Differences in case_when()
","text":"In TidierDB, after the clause is completed, the result for the new column should be separated by a comma ,
in contrast to TidierData.jl, where the result for the new column is separated by a =>
.
@chain db_table(db, :df_mem) begin\n @mutate(new_col = case_when(percent > .5, \"Pass\", # in TidierData, percent > .5 => \"Pass\",\n percent <= .5, \"Try Again\", # percent <= .5 => \"Try Again\"\n true, \"middle\"))\n @collect\n end\n
10\u00d75 DataFrame Rowidgroupsvaluepercentnew_colStringStringInt64Float64String1AAbb10.1Try Again2ABaa20.2Try Again3ACbb30.3Try Again4ADaa40.4Try Again5AEbb50.5Try Again6AFaa10.6Pass7AGbb20.7Pass8AHaa30.8Pass9AIbb40.9Pass10AJaa51.0Pass "},{"location":"examples/generated/UserGuide/key_differences/#joining-tables","title":"Joining Tables","text":"When joining a table, the column from both tables will be present, in contrast to TidierData which will keep one column
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/outofmemex/","title":"Working With Larger than RAM Datasets","text":"While using the DuckDB backend, TidierDB's lazy intferace enables querying datasets larger than your available RAM.
To illustrate this, we will recreate the Hugging Face x Polars example. The final table results are shown below and in this Hugging Face x DuckDB example
First we will load TidierDB, set up a local database and then set the URLs for the 2 training datasets from huggingface.co
using TidierDB\ndb = connect(duckdb())\n\nurls = [\"https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0000.parquet\",\n \"https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0001.parquet\"];\n
Here, we pass the vector of URLs to db_table
, which will not copy them into memory. Since these datasets are so large, we will also set stream = true
in @collect
to stream the results. If we wanted to read all the files in the folder we could have replaced the 0000
with *
(wildcard) db_table(db, \"Path/to/folder/*.parquet\")
Of note, reading these files from URLs is not as rapid as reading them from local files.
@chain db_table(db, urls) begin\n @group_by(horoscope)\n @summarise(count = n(), avg_blog_length = mean(length(text)))\n @arrange(desc(count))\n @aside @show_query _\n @collect(stream = true)\nend\n
Placing @aside @show_query _
before @collect
above lets us see the SQL query and collect it to a local DataFrame at the same time.
SELECT horoscope, COUNT(*) AS count, AVG(length(text)) AS avg_blog_length\n FROM read_parquet(['https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0000.parquet', 'https://huggingface.co/datasets/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0001.parquet'])\n GROUP BY horoscope\n ORDER BY avg_blog_length DESC\n12\u00d73 DataFrame\n Row \u2502 horoscope count avg_blog_length\n \u2502 String? Int64? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Aquarius 49568 1125.83\n 2 \u2502 Cancer 63512 1097.96\n 3 \u2502 Libra 60304 1060.61\n 4 \u2502 Capricorn 49402 1059.56\n 5 \u2502 Sagittarius 50431 1057.46\n 6 \u2502 Leo 58010 1049.6\n 7 \u2502 Taurus 61571 1022.69\n 8 \u2502 Gemini 52925 1020.26\n 9 \u2502 Scorpio 56495 1014.03\n 10 \u2502 Pisces 53812 1011.75\n 11 \u2502 Virgo 64629 996.684\n 12 \u2502 Aries 69134 918.081\n
To learn more about memory efficient queries on larger than RAM files, this blog from DuckDB will help maximize your local db
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/s3viaduckdb/","title":"S3 + DuckDB + TidierDB","text":"TidierDB allows you leverage DuckDB's seamless database integration.
Using DuckDB, you can connect to an AWS or GoogleCloud Database to query directly without making any local copies.
You can also use DBInterface.execute
to set up any DuckDB database connection you need and then use that db to query with TidierDB
using TidierDB\n\n#Connect to Google Cloud via DuckDB\n#google_db = connect(duckdb(), :gbq, access_key=\"string\", secret_key=\"string\")\n\n#Connect to AWS via DuckDB\naws_db = connect(duckdb(), :aws, aws_access_key_id= \"string\",\n aws_secret_access_key= \"string\",\n aws_region=\"us-east-1\")\ns3_csv_path = \"s3://path/to_data.csv\"\n\n@chain db_table(aws_db, s3_csv_path) begin\n @filter(!starts_with(column1, \"M\"))\n @group_by(cyl)\n @summarize(mpg = mean(mpg))\n @mutate(mpg_squared = mpg^2,\n mpg_rounded = round(mpg),\n mpg_efficiency = case_when(\n mpg >= cyl^2 , \"efficient\",\n mpg < 15.2 , \"inefficient\",\n \"moderate\"))\n @filter(mpg_efficiency in (\"moderate\", \"efficient\"))\n @arrange(desc(mpg_rounded))\n @collect\nend\n
2\u00d75 DataFrame\n Row \u2502 cyl mpg mpg_squared mpg_rounded mpg_efficiency\n \u2502 Int64? Float64? Float64? Float64? String?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 27.3444 747.719 27.0 efficient\n 2 \u2502 6 19.7333 389.404 20.0 moderate\n
This page was generated using Literate.jl.
"},{"location":"examples/generated/UserGuide/udfs_ex/","title":"Flexible Syntax and UDFs","text":"TidierDB is unique in its statement parsing flexibility. This means that any built-in SQL function or user defined function (UDF) is readily available. To use any function built into a database in @mutate
or in @summarize
, simply write the function call correctly, but replace '
with \"
. This also applies to any UDF. The example below will illustrate UDFs in the context of DuckDB.
# Set up the connection\nusing TidierDB #reexports DuckDB\ndb = DuckDB.DB()\ncon = DuckDB.connect(db) # this will be important for UDFs\nmtcars_path = \"https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv\"\nmtcars = db_table(con, mtcars_path);\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#aggregate-function-in-summarize","title":"aggregate function in @summarize
","text":"Let's use the DuckDB kurtosis
aggregate function
@chain t(mtcars) begin\n @group_by cyl\n @summarize(kurt = kurtosis(mpg))\n @collect\nend\n3\u00d72 DataFrame\n Row \u2502 cyl kurt\n \u2502 Int64? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 4 -1.43411\n 2 \u2502 6 -1.82944\n 3 \u2502 8 0.330061\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#aggregate-functions-in-mutate","title":"aggregate functions in @mutate
","text":"To use aggregate SQL functions that are built in to any database, but exist outside of the TidierDB parser, simply wrap the function call in agg()
@chain t(mtcars) begin\n @group_by(cyl)\n @mutate(kurt = agg(kurtosis(mpg)))\n @select cyl mpg kurt\n @collect\nend\n\n32\u00d73 DataFrame\n Row \u2502 cyl mpg kurt\n \u2502 Int64? Float64? Float64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 8 18.7 0.330061\n 2 \u2502 8 14.3 0.330061\n 3 \u2502 8 16.4 0.330061\n 4 \u2502 8 17.3 0.330061\n 5 \u2502 8 15.2 0.330061\n 6 \u2502 8 10.4 0.330061\n 7 \u2502 8 10.4 0.330061\n \u22ee \u2502 \u22ee \u22ee \u22ee\n 27 \u2502 6 21.0 -1.82944\n 28 \u2502 6 21.4 -1.82944\n 29 \u2502 6 18.1 -1.82944\n 30 \u2502 6 19.2 -1.82944\n 31 \u2502 6 17.8 -1.82944\n 32 \u2502 6 19.7 -1.82944\n 19 rows omitted\nend\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#duckdb-function-chaining","title":"DuckDB function chaining","text":"In DuckDB, functions can be chained together with .
. TidierDB lets you leverage this.
@chain t(mtcars) begin\n @mutate(model2 = model.upper().string_split(\" \").list_aggr(\"string_agg\",\".\").concat(\".\"))\n @select model model2\n @collect\nend\n32\u00d72 DataFrame\n Row \u2502 model model2\n \u2502 String? String?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Mazda RX4 MAZDA.RX4.\n 2 \u2502 Mazda RX4 Wag MAZDA.RX4.WAG.\n 3 \u2502 Datsun 710 DATSUN.710.\n 4 \u2502 Hornet 4 Drive HORNET.4.DRIVE.\n 5 \u2502 Hornet Sportabout HORNET.SPORTABOUT.\n 6 \u2502 Valiant VALIANT.\n 7 \u2502 Duster 360 DUSTER.360.\n \u22ee \u2502 \u22ee \u22ee\n 27 \u2502 Porsche 914-2 PORSCHE.914-2.\n 28 \u2502 Lotus Europa LOTUS.EUROPA.\n 29 \u2502 Ford Pantera L FORD.PANTERA.L.\n 30 \u2502 Ferrari Dino FERRARI.DINO.\n 31 \u2502 Maserati Bora MASERATI.BORA.\n 32 \u2502 Volvo 142E VOLVO.142E.\n 19 rows omitted\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#rowid-and-pseudocolumns","title":"rowid
and pseudocolumns","text":"When a table is not being read directly from a file, rowid
is available for use. In general, TidierDB should support all pseudocolumns.
copy_to(db, mtcars_path, \"mtcars\"); # copying table in for demostration purposes\n@chain db_table(con, :mtcars) begin\n @filter(rowid == 4)\n @select(model:hp)\n @collect\nend\n1\u00d75 DataFrame\n Row \u2502 model mpg cyl disp hp\n \u2502 String? Float64? Int64? Float64? Int64?\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 Hornet Sportabout 18.7 8 360.0 175\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#udf-sqlite-example","title":"UDF SQLite Example","text":"using SQLite\nsql = connect(sqlite());\ndf = DataFrame(id = [string('A' + i \u00f7 26, 'A' + i % 26) for i in 0:9],\n groups = [i % 2 == 0 ? \"aa\" : \"bb\" for i in 1:10],\n value = repeat(1:5, 2),\n percent = 0.1:0.1:1.0);\n\ncopy_to(sql, df, \"df_mem\");\nSQLite.@register sql function diff_of_squares(x, y)\n x^2 - y^2\n end;\n\n@chain db_table(sql, \"df_mem\") begin\n @select(value, percent)\n @mutate(plus3 = diff_of_squares(value, percent))\n @collect\nend\n10\u00d73 DataFrame\n Row \u2502 value percent plus3\n \u2502 Int64 Float64 Float64\n\u2500\u2500\u2500\u2500\u2500\u253c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n 1 \u2502 1 0.1 0.99\n 2 \u2502 2 0.2 3.96\n 3 \u2502 3 0.3 8.91\n 4 \u2502 4 0.4 15.84\n 5 \u2502 5 0.5 24.75\n 6 \u2502 1 0.6 0.64\n 7 \u2502 2 0.7 3.51\n 8 \u2502 3 0.8 8.36\n 9 \u2502 4 0.9 15.19\n 10 \u2502 5 1.0 24.0\n
"},{"location":"examples/generated/UserGuide/udfs_ex/#how-to-create-udf-in-duckdb","title":"How to create UDF in DuckDB","text":"Example coming soon.
This page was generated using Literate.jl.
"}]} \ No newline at end of file diff --git a/latest/sitemap.xml.gz b/latest/sitemap.xml.gz index a72535d..482a3ee 100644 Binary files a/latest/sitemap.xml.gz and b/latest/sitemap.xml.gz differ