diff --git a/storage/duckdb/cmake/duckdb.cmake b/storage/duckdb/cmake/duckdb.cmake index 1cd6fca5bac2e..82c6ab143c9a3 100644 --- a/storage/duckdb/cmake/duckdb.cmake +++ b/storage/duckdb/cmake/duckdb.cmake @@ -74,6 +74,11 @@ ExternalProject_Add(duckdb_build -DBUILD_TPCE=OFF -DEXTENSION_STATIC_BUILD=1 "-DDUCKDB_EXTENSION_CONFIGS=${CMAKE_CURRENT_SOURCE_DIR}/cmake/duckdb_extensions.cmake" + # Upstream sets DUCKDB_EXTENSION_JEMALLOC_LINKED via add_extension_definitions(), + # which runs in extension/ but NOT in src/, so allocator.cpp (in duckdb_static) + # compiles the glibc malloc() path even though libjemalloc_extension.a is linked. + # Define it globally + add the jemalloc header dir so the USE_JEMALLOC branch is active. + "-DCMAKE_CXX_FLAGS=-DDUCKDB_EXTENSION_JEMALLOC_LINKED=1 -I${DUCKDB_SUBMODULE_DIR}/extension/jemalloc/include" -DENABLE_SANITIZER=FALSE -DENABLE_UBSAN=OFF -DOVERRIDE_GIT_DESCRIBE=v1.5.2-0-g0000000000 diff --git a/storage/duckdb/cmake/duckdb_extensions.cmake b/storage/duckdb/cmake/duckdb_extensions.cmake index b831f89cc0a5d..ff6134187b953 100644 --- a/storage/duckdb/cmake/duckdb_extensions.cmake +++ b/storage/duckdb/cmake/duckdb_extensions.cmake @@ -1,6 +1,7 @@ # Extensions required by the DuckDB storage engine plugin for MariaDB. # This config is passed to DuckDB via DUCKDB_EXTENSION_CONFIGS. 
+duckdb_extension_load(jemalloc) duckdb_extension_load(core_functions) duckdb_extension_load(icu) duckdb_extension_load(json) diff --git a/storage/duckdb/runtime/duckdb_mysql_compat.cc b/storage/duckdb/runtime/duckdb_mysql_compat.cc index af6e5afe7fc03..c22ac0d31e196 100644 --- a/storage/duckdb/runtime/duckdb_mysql_compat.cc +++ b/storage/duckdb/runtime/duckdb_mysql_compat.cc @@ -54,6 +54,9 @@ #include "duckdb/main/connection.hpp" #include "duckdb/common/types/string_type.hpp" +#include "duckdb/execution/expression_executor.hpp" +#include "duckdb/function/scalar/regexp.hpp" +#include "duckdb/planner/expression/bound_function_expression.hpp" #include "re2/re2.h" namespace myduck @@ -982,6 +985,86 @@ static void locate_3arg_func(duckdb::DataChunk &args, }); } +/* ================================================================ + regexp_replace(VARCHAR, VARCHAR, VARCHAR) -> VARCHAR + + MariaDB REGEXP_REPLACE replaces ALL matches (global), unlike DuckDB's + native 3-arg form which replaces only the first. We reuse DuckDB's + native bind-data / local-state so a constant pattern is compiled once + at bind time (RegexInitLocalState) instead of per row. + + Invalid-pattern behavior mirrors MariaDB: + - constant pattern -> RegexLocalState ctor throws (query error); + - non-constant -> per-row NULL. 
+   ================================================================ */
+
+static duckdb::unique_ptr<duckdb::FunctionData>
+regexp_replace_bind(duckdb::ClientContext &context,
+                    duckdb::ScalarFunction &,
+                    duckdb::vector<duckdb::unique_ptr<duckdb::Expression>>
+                        &arguments)
+{
+  auto data= duckdb::make_uniq<duckdb::RegexpReplaceBindData>();
+  data->constant_pattern= duckdb::regexp_util::TryParseConstantPattern(
+      context, *arguments[1], data->constant_string);
+  data->global_replace= true; /* replace every match, not only the first */
+  data->options.set_log_errors(false);
+  return duckdb::unique_ptr<duckdb::FunctionData>(std::move(data));
+}
+
+static void regexp_replace_global_func(duckdb::DataChunk &args,
+                                       duckdb::ExpressionState &state,
+                                       duckdb::Vector &result)
+{
+  auto &func_expr= state.expr.Cast<duckdb::BoundFunctionExpression>();
+  auto &info= func_expr.bind_info->Cast<duckdb::RegexpReplaceBindData>();
+
+  auto &strings= args.data[0];
+  auto &patterns= args.data[1];
+  auto &replaces= args.data[2];
+
+  if (info.constant_pattern) /* pattern already compiled in local state */
+  {
+    auto &lstate= duckdb::ExecuteFunctionState::GetFunctionState(state)
+                      ->Cast<duckdb::RegexLocalState>();
+    duckdb::BinaryExecutor::Execute<duckdb::string_t, duckdb::string_t,
+                                    duckdb::string_t>(
+        strings, replaces, result, args.size(),
+        [&](duckdb::string_t input, duckdb::string_t replace) {
+          std::string s= input.GetString();
+          duckdb_re2::RE2::GlobalReplace(
+              &s, lstate.constant_pattern,
+              duckdb_re2::StringPiece(replace.GetData(), replace.GetSize()));
+          return duckdb::StringVector::AddString(result, s);
+        });
+  }
+  else /* pattern varies per row: compile it for each row */
+  {
+    duckdb::TernaryExecutor::ExecuteWithNulls<duckdb::string_t,
+                                              duckdb::string_t,
+                                              duckdb::string_t,
+                                              duckdb::string_t>(
+        strings, patterns, replaces, result, args.size(),
+        [&](duckdb::string_t input, duckdb::string_t pattern,
+            duckdb::string_t replace, duckdb::ValidityMask &mask,
+            duckdb::idx_t idx) -> duckdb::string_t {
+          duckdb_re2::RE2 re(
+              duckdb_re2::StringPiece(pattern.GetData(), pattern.GetSize()),
+              info.options);
+          if (!re.ok())
+          {
+            mask.SetInvalid(idx); /* invalid pattern -> NULL for this row */
+            return duckdb::string_t();
+          }
+          std::string s= input.GetString();
+          duckdb_re2::RE2::GlobalReplace(
+              &s, re,
+              duckdb_re2::StringPiece(replace.GetData(), replace.GetSize()));
+          return duckdb::StringVector::AddString(result, s);
+        });
+  }
+}
+
 /*
================================================================ Registration ================================================================ */ @@ -1207,32 +1290,15 @@ void register_mysql_compat_functions(duckdb::DatabaseInstance &db) } /* regexp_replace(VARCHAR, VARCHAR, VARCHAR) → VARCHAR - Replaces all occurrences of pattern in expr with replacement. */ + Global (replace-all) MariaDB semantics with bind-time pattern + compilation for constant patterns. See regexp_replace_global_func. */ { duckdb::ScalarFunctionSet set("regexp_replace"); set.AddFunction(duckdb::ScalarFunction( {duckdb::LogicalType::VARCHAR, duckdb::LogicalType::VARCHAR, duckdb::LogicalType::VARCHAR}, - duckdb::LogicalType::VARCHAR, - [](duckdb::DataChunk &args, duckdb::ExpressionState &, - duckdb::Vector &result) { - duckdb::TernaryExecutor::Execute( - args.data[0], args.data[1], args.data[2], result, args.size(), - [&](duckdb::string_t expr, duckdb::string_t pat, - duckdb::string_t repl) -> duckdb::string_t { - duckdb_re2::RE2 re( - duckdb_re2::StringPiece(pat.GetData(), pat.GetSize())); - if (!re.ok()) - return expr; - std::string s(expr.GetData(), expr.GetSize()); - duckdb_re2::RE2::GlobalReplace( - &s, re, - duckdb_re2::StringPiece(repl.GetData(), repl.GetSize())); - return duckdb::StringVector::AddString(result, s); - }); - })); + duckdb::LogicalType::VARCHAR, regexp_replace_global_func, + regexp_replace_bind, nullptr, nullptr, duckdb::RegexInitLocalState)); duckdb::CreateScalarFunctionInfo info(std::move(set)); info.on_conflict= duckdb::OnCreateConflict::ALTER_ON_CONFLICT; catalog.CreateFunction(transaction, info); diff --git a/storage/duckdb/tpch/02_generate.sh b/storage/duckdb/tpch/02_generate.sh index 265c8d2a83510..138fd6d5db84f 100755 --- a/storage/duckdb/tpch/02_generate.sh +++ b/storage/duckdb/tpch/02_generate.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# Generate TPC-H data (.tbl, pipe-delimited) at scale factor $SF into $DATA_DIR. 
-# Skips generation if all .tbl files already exist (set FORCE=1 to regenerate). +# Generate TPC-H data (Parquet) at scale factor $SF into $DATA_DIR. +# Skips generation if all .parquet files already exist (set FORCE=1 to regenerate). set -euo pipefail DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$DIR/config.sh" @@ -10,12 +10,12 @@ command -v tpchgen-cli >/dev/null 2>&1 || { echo "ERROR: run ./01_install.sh fir mkdir -p "$DATA_DIR" missing=0 -for t in "${TABLES[@]}"; do [ -f "$DATA_DIR/$t.tbl" ] || missing=1; done +for t in "${TABLES[@]}"; do [ -f "$DATA_DIR/$t.parquet" ] || missing=1; done if [ "$missing" = 0 ] && [ "${FORCE:-0}" != 1 ]; then - echo "All .tbl files already present in $DATA_DIR (set FORCE=1 to regenerate)." + echo "All .parquet files already present in $DATA_DIR (set FORCE=1 to regenerate)." exit 0 fi -echo "Generating TPC-H SF$SF (.tbl) into $DATA_DIR ..." -tpchgen-cli -s "$SF" --output-dir "$DATA_DIR" -ls -la "$DATA_DIR"/*.tbl +echo "Generating TPC-H SF$SF (Parquet) into $DATA_DIR ..." +tpchgen-cli -s "$SF" --format=parquet --output-dir "$DATA_DIR" +ls -la "$DATA_DIR"/*.parquet diff --git a/storage/duckdb/tpch/03_schema.sh b/storage/duckdb/tpch/03_schema.sh index 7689af55449b9..bfffa4a61a7a6 100755 --- a/storage/duckdb/tpch/03_schema.sh +++ b/storage/duckdb/tpch/03_schema.sh @@ -1,18 +1,23 @@ #!/usr/bin/env bash -# Create schema $SCHEMA and the 8 TPC-H tables inside the embedded DuckDB, -# via run_in_duckdb. CREATE OR REPLACE makes this idempotent. +# Create database $SCHEMA and the 8 TPC-H tables as ENGINE=DUCKDB, directly +# through the mariadb client. DROP + CREATE makes this idempotent. set -euo pipefail DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$DIR/config.sh" -echo "Creating schema '$SCHEMA' and tables ..." 
-duck "CREATE SCHEMA IF NOT EXISTS $SCHEMA" >/dev/null -duck "CREATE OR REPLACE TABLE $SCHEMA.region (r_regionkey INTEGER PRIMARY KEY, r_name VARCHAR, r_comment VARCHAR)" >/dev/null -duck "CREATE OR REPLACE TABLE $SCHEMA.nation (n_nationkey INTEGER PRIMARY KEY, n_name VARCHAR, n_regionkey INTEGER, n_comment VARCHAR)" >/dev/null -duck "CREATE OR REPLACE TABLE $SCHEMA.supplier (s_suppkey INTEGER PRIMARY KEY, s_name VARCHAR, s_address VARCHAR, s_nationkey INTEGER, s_phone VARCHAR, s_acctbal DECIMAL(15,2), s_comment VARCHAR)" >/dev/null -duck "CREATE OR REPLACE TABLE $SCHEMA.customer (c_custkey INTEGER PRIMARY KEY, c_name VARCHAR, c_address VARCHAR, c_nationkey INTEGER, c_phone VARCHAR, c_acctbal DECIMAL(15,2), c_mktsegment VARCHAR, c_comment VARCHAR)" >/dev/null -duck "CREATE OR REPLACE TABLE $SCHEMA.part (p_partkey INTEGER PRIMARY KEY, p_name VARCHAR, p_mfgr VARCHAR, p_brand VARCHAR, p_type VARCHAR, p_size INTEGER, p_container VARCHAR, p_retailprice DECIMAL(15,2), p_comment VARCHAR)" >/dev/null -duck "CREATE OR REPLACE TABLE $SCHEMA.partsupp (ps_partkey INTEGER, ps_suppkey INTEGER, ps_availqty INTEGER, ps_supplycost DECIMAL(15,2), ps_comment VARCHAR, PRIMARY KEY (ps_partkey, ps_suppkey))" >/dev/null -duck "CREATE OR REPLACE TABLE $SCHEMA.orders (o_orderkey BIGINT PRIMARY KEY, o_custkey INTEGER, o_orderstatus VARCHAR, o_totalprice DECIMAL(15,2), o_orderdate DATE, o_orderpriority VARCHAR, o_clerk VARCHAR, o_shippriority INTEGER, o_comment VARCHAR)" >/dev/null -duck "CREATE OR REPLACE TABLE $SCHEMA.lineitem (l_orderkey BIGINT, l_partkey INTEGER, l_suppkey INTEGER, l_linenumber INTEGER, l_quantity DECIMAL(15,2), l_extendedprice DECIMAL(15,2), l_discount DECIMAL(15,2), l_tax DECIMAL(15,2), l_returnflag VARCHAR, l_linestatus VARCHAR, l_shipdate DATE, l_commitdate DATE, l_receiptdate DATE, l_shipinstruct VARCHAR, l_shipmode VARCHAR, l_comment VARCHAR, PRIMARY KEY (l_orderkey, l_linenumber))" >/dev/null -echo "Schema '$SCHEMA' ready." 
+echo "Creating database '$SCHEMA' and ENGINE=DUCKDB tables ..." +mdb "CREATE DATABASE IF NOT EXISTS $SCHEMA" + +for t in "${TABLES[@]}"; do + mdb_db "DROP TABLE IF EXISTS $t" +done + +mdb_db "CREATE TABLE region (r_regionkey INTEGER PRIMARY KEY, r_name VARCHAR(25), r_comment VARCHAR(152)) ENGINE=DUCKDB DEFAULT CHARSET=$CHARSET" +mdb_db "CREATE TABLE nation (n_nationkey INTEGER PRIMARY KEY, n_name VARCHAR(25), n_regionkey INTEGER, n_comment VARCHAR(152)) ENGINE=DUCKDB DEFAULT CHARSET=$CHARSET" +mdb_db "CREATE TABLE supplier (s_suppkey INTEGER PRIMARY KEY, s_name VARCHAR(25), s_address VARCHAR(40), s_nationkey INTEGER, s_phone VARCHAR(15), s_acctbal DECIMAL(15,2), s_comment VARCHAR(101)) ENGINE=DUCKDB DEFAULT CHARSET=$CHARSET" +mdb_db "CREATE TABLE customer (c_custkey INTEGER PRIMARY KEY, c_name VARCHAR(25), c_address VARCHAR(40), c_nationkey INTEGER, c_phone VARCHAR(15), c_acctbal DECIMAL(15,2), c_mktsegment VARCHAR(10), c_comment VARCHAR(117)) ENGINE=DUCKDB DEFAULT CHARSET=$CHARSET" +mdb_db "CREATE TABLE part (p_partkey INTEGER PRIMARY KEY, p_name VARCHAR(55), p_mfgr VARCHAR(25), p_brand VARCHAR(10), p_type VARCHAR(25), p_size INTEGER, p_container VARCHAR(10), p_retailprice DECIMAL(15,2), p_comment VARCHAR(23)) ENGINE=DUCKDB DEFAULT CHARSET=$CHARSET" +mdb_db "CREATE TABLE partsupp (ps_partkey INTEGER, ps_suppkey INTEGER, ps_availqty INTEGER, ps_supplycost DECIMAL(15,2), ps_comment VARCHAR(199), PRIMARY KEY (ps_partkey, ps_suppkey)) ENGINE=DUCKDB DEFAULT CHARSET=$CHARSET" +mdb_db "CREATE TABLE orders (o_orderkey BIGINT PRIMARY KEY, o_custkey INTEGER, o_orderstatus CHAR(1), o_totalprice DECIMAL(15,2), o_orderdate DATE, o_orderpriority VARCHAR(15), o_clerk VARCHAR(15), o_shippriority INTEGER, o_comment VARCHAR(79)) ENGINE=DUCKDB DEFAULT CHARSET=$CHARSET" +mdb_db "CREATE TABLE lineitem (l_orderkey BIGINT, l_partkey INTEGER, l_suppkey INTEGER, l_linenumber INTEGER, l_quantity DECIMAL(15,2), l_extendedprice DECIMAL(15,2), l_discount DECIMAL(15,2), l_tax DECIMAL(15,2), 
l_returnflag CHAR(1), l_linestatus CHAR(1), l_shipdate DATE, l_commitdate DATE, l_receiptdate DATE, l_shipinstruct VARCHAR(25), l_shipmode VARCHAR(10), l_comment VARCHAR(44), PRIMARY KEY (l_orderkey, l_linenumber)) ENGINE=DUCKDB DEFAULT CHARSET=$CHARSET" +echo "Database '$SCHEMA' ready." diff --git a/storage/duckdb/tpch/04_load.sh b/storage/duckdb/tpch/04_load.sh index 0d880cee5eb6c..7ef6617056190 100755 --- a/storage/duckdb/tpch/04_load.sh +++ b/storage/duckdb/tpch/04_load.sh @@ -1,19 +1,22 @@ #!/usr/bin/env bash -# Populate $SCHEMA.* with COPY from the generated .tbl files (DuckDB reads the -# pipe-delimited, header-less, trailing-'|' tbl format with DELIMITER '|'). +# Populate $SCHEMA.* from the generated Parquet files. Loading runs on the +# embedded DuckDB via run_in_duckdb (the duck helper) with read_parquet(): +# the ENGINE=DUCKDB tables created in step 3 are addressable inside DuckDB as +# ., so INSERT ... SELECT * FROM read_parquet() fills them +# server-side without round-tripping the data through the MariaDB client. # Times each table and prints row counts. set -euo pipefail DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$DIR/config.sh" -echo "== COPY load from $DATA_DIR into schema '$SCHEMA' (wall clock incl. client) ==" +echo "== read_parquet load from $DATA_DIR into database '$SCHEMA' (wall clock incl. 
client) ==" total=0 for t in "${TABLES[@]}"; do - f="$DATA_DIR/$t.tbl" + f="$DATA_DIR/$t.parquet" [ -f "$f" ] || { echo "ERROR: missing $f (run ./02_generate.sh)" >&2; exit 1; } duck "TRUNCATE $SCHEMA.$t" >/dev/null 2>&1 || true start=$(date +%s.%N) - duck "COPY $SCHEMA.$t FROM '$f' (DELIMITER '|')" >/dev/null + duck "INSERT INTO $SCHEMA.$t SELECT * FROM read_parquet('$f')" >/dev/null end=$(date +%s.%N) total=$(awk -v a="$total" -v s="$start" -v e="$end" 'BEGIN{print a+(e-s)}') awk -v s="$start" -v e="$end" -v t="$t" 'BEGIN{printf "%-10s %9.3f s\n", t, e-s}' @@ -23,5 +26,5 @@ awk -v a="$total" 'BEGIN{printf "%-10s %9.3f s\n", "TOTAL", a}' echo "== row counts ==" for t in "${TABLES[@]}"; do printf "%-10s " "$t" - duck "SELECT count(*) FROM $SCHEMA.$t" | grep -Eo '^[0-9]+$' | tail -1 + mdb_db "SELECT count(*) FROM $t" | grep -Eo '^[0-9]+$' | tail -1 done diff --git a/storage/duckdb/tpch/05_run_queries.sh b/storage/duckdb/tpch/05_run_queries.sh index f411701a6b966..c3b98c46ebf55 100755 --- a/storage/duckdb/tpch/05_run_queries.sh +++ b/storage/duckdb/tpch/05_run_queries.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -# Run the 22 TPC-H queries from $TPCH_SQL against $SCHEMA via run_in_duckdb, -# timing each (wall clock, one client invocation). Writes a TSV of timings. -# Queries get the raw MariaDB-dialect text (no pushdown rewrites): any query -# using MariaDB-only syntax is reported as ERR. +# Run the 22 TPC-H queries from $TPCH_SQL against database $SCHEMA directly +# through the mariadb client (no run_in_duckdb), timing each (wall clock, one +# client invocation). Writes a TSV of timings. Any query that errors out +# (e.g. unsupported syntax) is reported as ERR. 
set -uo pipefail DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "$DIR/config.sh" @@ -21,11 +21,9 @@ for i in $(seq 1 22); do n=$(printf "%02d" "$i") f="$MQ/q$n.sql" [ -f "$f" ] || continue - combined="SET schema '$SCHEMA'; $(cat "$f")" - esc=$(printf '%s' "$combined" | sed "s/'/''/g") start=$(date +%s.%N) - out=$("$MARIADB" -N -e "SELECT run_in_duckdb('$esc')" 2>&1) + out=$("$MARIADB" --default-character-set="$CHARSET" -N -D "$SCHEMA" < "$f" 2>&1) end=$(date +%s.%N) if printf '%s' "$out" | grep -qiE 'error'; then diff --git a/storage/duckdb/tpch/README.md b/storage/duckdb/tpch/README.md index f930b3ecc8b36..2b137a7010908 100644 --- a/storage/duckdb/tpch/README.md +++ b/storage/duckdb/tpch/README.md @@ -1,17 +1,17 @@ # TPC-H kit — DuckDB storage engine (MariaDB) -Reproducible TPC-H pipeline for the embedded DuckDB engine: install a generator, -generate data, create the schema, COPY-load it, and run the 22 queries — all -through MariaDB's `run_in_duckdb`. +Reproducible TPC-H pipeline for the DuckDB storage engine: install a generator, +generate Parquet data, create the `ENGINE=DUCKDB` tables, load them with +`read_parquet()`, and run the 22 queries directly through the `mariadb` client. ## Pipeline | Step | Script | What it does | |---|---|---| | 1 | `01_install.sh` | Install `tpchgen-cli` (pip / uv / cargo). | -| 2 | `02_generate.sh` | Generate `.tbl` data at scale factor `$SF` into `$DATA_DIR`. | -| 3 | `03_schema.sh` | Create schema `$SCHEMA` + 8 tables in the embedded DuckDB. | -| 4 | `04_load.sh` | `COPY` each `.tbl` into `$SCHEMA.*`; prints per-table timings + row counts. | +| 2 | `02_generate.sh` | Generate Parquet data at scale factor `$SF` into `$DATA_DIR`. | +| 3 | `03_schema.sh` | Create database `$SCHEMA` + 8 `ENGINE=DUCKDB` tables. | +| 4 | `04_load.sh` | `INSERT ... SELECT * FROM read_parquet()` each `.parquet` into `$SCHEMA.*` (via `run_in_duckdb`); prints per-table timings + row counts. 
| | 5 | `05_run_queries.sh` | Run the 22 queries from `$TPCH_SQL`; writes `query_timings.tsv`. | Run everything: `./run_all.sh` (steps are idempotent; generation is skipped if data exists). @@ -23,33 +23,35 @@ All knobs live in `config.sh` and are overridable via environment: ```bash SF=1 ./run_all.sh # scale factor 1 DATA_DIR=/data/tpch SF=10 ./run_all.sh # custom data location -SCHEMA=tpch_bench ./03_schema.sh # custom DuckDB schema +SCHEMA=tpch_bench ./03_schema.sh # custom MariaDB database ``` | Var | Default | Meaning | |---|---|---| | `SF` | `10` | TPC-H scale factor | -| `DATA_DIR` | `/git/tpch/sf` | where `.tbl` files are generated/read | -| `SCHEMA` | `bench` | DuckDB schema populated via the UDF | +| `DATA_DIR` | `/git/tpch/sf` | where `.parquet` files are generated/read | +| `SCHEMA` | `bench` | MariaDB database holding the `ENGINE=DUCKDB` tables | | `TPCH_SQL` | `/tpch.sql` | source of the 22 (MariaDB-dialect) queries | | `MARIADB` | `mariadb` | client command | ## Prerequisites -- A running MariaDB server with the DuckDB engine loaded and the - `run_in_duckdb` function available. +- A running MariaDB server with the DuckDB storage engine loaded + (`ENGINE=DUCKDB` available) and the `run_in_duckdb` function installed + (used only for the Parquet load). - `pip`, `uv`, or `cargo` to install the generator; `tpchgen-cli` on `PATH` afterwards (pip user installs land in `~/.local/bin`). ## How it works / caveats -- **Generator:** `tpchgen-cli -s --output-dir ` emits classic - `.tbl` files (pipe-delimited, no header, trailing `|`). DuckDB loads these with - `COPY ... (DELIMITER '|')` — the trailing delimiter is tolerated. -- **Load target:** data goes into a DuckDB-native schema (`bench`) inside the - embedded instance via `run_in_duckdb`, not into `ENGINE=DUCKDB` MariaDB - tables (which can't be `COPY`-loaded). -- **Queries:** taken from `/tpch.sql` (MariaDB dialect) and executed as - `SET schema ''; ` through the UDF. 
The UDF receives the **raw** - text — the engine's dialect rewrites only apply on pushdown, so any - MariaDB-only syntax errors out and is reported as `ERR` in the timings. +- **Generator:** `tpchgen-cli -s <SF> --format=parquet --output-dir <dir>` + emits one `.parquet` file per table. +- **Load target:** data goes into `ENGINE=DUCKDB` tables in a regular MariaDB + database (`bench`). The load runs server-side on the embedded DuckDB via + `run_in_duckdb`: `INSERT INTO <database>.<table> SELECT * FROM read_parquet(...)`. + ENGINE=DUCKDB tables are addressable inside DuckDB as `<database>.<table>
`, so the + Parquet data never round-trips through the MariaDB client. The TPC-H Parquet + column order/types match the table definitions, so a plain `SELECT *` works. +- **Queries:** taken from `$TPCH_SQL` (MariaDB dialect) and executed directly + through the `mariadb` client with `$SCHEMA` as the default database. Any + query that errors out is reported as `ERR` in the timings. diff --git a/storage/duckdb/tpch/config.sh b/storage/duckdb/tpch/config.sh index ebe296201c15a..a4d55f4fa66c5 100755 --- a/storage/duckdb/tpch/config.sh +++ b/storage/duckdb/tpch/config.sh @@ -1,10 +1,12 @@ #!/usr/bin/env bash # Shared configuration for the TPC-H DuckDB-engine benchmark kit. +# Statements run directly through the mariadb client against ENGINE=DUCKDB +# tables; only the Parquet load (04_load.sh) uses the run_in_duckdb UDF. # Override any value via environment, e.g. SF=1 ./04_load.sh # -# SF TPC-H scale factor (default 10) +# SF TPC-H scale factor (default 10) -# DATA_DIR where .tbl files are generated/loaded from (default /git/tpch/sf) +# DATA_DIR where .parquet files are generated/loaded from (default /git/tpch/sf) -# SCHEMA DuckDB schema populated via run_in_duckdb (default bench) +# SCHEMA MariaDB database holding the ENGINE=DUCKDB tables (default bench) # TPCH_SQL MariaDB-dialect query file (source of the 22 queries) # MARIADB mariadb client command @@ -18,11 +20,29 @@ MARIADB="${MARIADB:-mariadb}" TABLES=(region nation supplier customer part partsupp orders lineitem) +# The DuckDB engine only accepts utf8mb3/utf8mb4/ascii column charsets, so the +# client connection must use utf8mb4 (otherwise CREATE TABLE inherits the +# server's default, e.g. latin1, and fails with "non-utf8 charset"). +CHARSET="${CHARSET:-utf8mb4}" + +# Run one SQL statement directly through the mariadb client, server-wide +# (no default database). Use for CREATE DATABASE and other global DDL. +mdb() { + "$MARIADB" --default-character-set="$CHARSET" -N -e "$1" +} + +# Run one SQL statement directly through the mariadb client within $SCHEMA. 
+mdb_db() { + "$MARIADB" --default-character-set="$CHARSET" -N -D "$SCHEMA" -e "$1" +} + # Run one statement on the embedded DuckDB through MariaDB's run_in_duckdb. # Single quotes in $1 are escaped so SQL string/identifier literals survive. +# Used for DuckDB-native SQL (e.g. read_parquet()) against ENGINE=DUCKDB tables, +# which are addressable inside DuckDB as <database>.<table>. # Prints the UDF result text (callers discard it when not needed). duck() { local esc esc=$(printf '%s' "$1" | sed "s/'/''/g") - "$MARIADB" -N -e "SELECT run_in_duckdb('$esc')" + "$MARIADB" --default-character-set="$CHARSET" -N -e "SELECT run_in_duckdb('$esc')" }