diff --git a/benchmark/tpcds/README.md b/benchmark/tpcds/README.md
index f13587b2eb568..3dcc95bac46f3 100644
--- a/benchmark/tpcds/README.md
+++ b/benchmark/tpcds/README.md
@@ -2,27 +2,12 @@
 
 ## Preparing the Table and Data
 
-We use [DuckDB](https://github.com/duckdb/duckdb) to generate TPC-DS data.
+We use [DuckDB](https://duckdb.org/docs/installation/) to generate TPC-DS data.
 
 After installing DuckDB, you can use these commands to generate the data ([more information](https://github.com/duckdb/duckdb/tree/master/extension/tpcds)):
 
-```sql
-INSTALL tpcds;
-LOAD tpcds;
-SELECT * FROM dsdgen(sf=0.01) -- sf can be other values, such as 0.1, 1, 10, ...
-EXPORT DATABASE '/tmp/tpcds_0_01/' (FORMAT CSV, DELIMITER '|');
-```
-
-Then, move the data to current directory:
-
-```shell
-mv /tmp/tpcds_0_01/ "$(pwd)/data/"
-```
-
-After that, you can load data to Databend:
-
 ```shell
-./load_data.sh
+./load_data.sh 0.1
 ```
 
 ## Benchmark
@@ -32,5 +17,5 @@ To run the TPC-DS Benchmark, first build `databend-sqllogictests` binary.
 Then, execute the following command in your shell:
 
 ```shell
-databend-sqllogictests --handlers mysql --database tpcds --run_dir tpcds --bench 
+databend-sqllogictests --handlers mysql --database tpcds --run_dir tpcds --bench
 ```
\ No newline at end of file
diff --git a/benchmark/tpcds/load_data.sh b/benchmark/tpcds/load_data.sh
index 310cd3db3443a..852fd714c7d52 100755
--- a/benchmark/tpcds/load_data.sh
+++ b/benchmark/tpcds/load_data.sh
@@ -3,33 +3,44 @@
 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 . "$CURDIR"/shell_env.sh
 
+factor=$1
+
+echo """
+INSTALL tpcds;
+LOAD tpcds;
+SELECT * FROM dsdgen(sf=$factor); -- sf can be other values, such as 0.1, 1, 10, ...
+EXPORT DATABASE '/tmp/tpcds_$factor/' (FORMAT CSV, DELIMITER '|');
+""" | duckdb
+
+mv /tmp/tpcds_$factor/ "$(pwd)/data/"
+
 # Create Database
 echo "CREATE DATABASE IF NOT EXISTS ${MYSQL_DATABASE}" | $BENDSQL_CLIENT_CONNECT_DEFAULT
 
 tables=(
-    call_center
-    catalog_returns
-    customer_address
-    customer_demographics
-    household_demographics
-    inventory
-    promotion
-    ship_mode
-    store_returns
-    time_dim
-    web_page
+  call_center
+  catalog_returns
+  customer_address
+  customer_demographics
+  household_demographics
+  inventory
+  promotion
+  ship_mode
+  store_returns
+  time_dim
+  web_page
   web_sales
-    catalog_page
-    catalog_sales
-    customer
-    date_dim
-    income_band
-    item
-    reason
-    store
-    store_sales
-    warehouse
-    web_returns
+  catalog_page
+  catalog_sales
+  customer
+  date_dim
+  income_band
+  item
+  reason
+  store
+  store_sales
+  warehouse
+  web_returns
   web_site
 )
 
@@ -43,11 +54,12 @@ done
 
 cat "$CURDIR"/tpcds.sql | $BENDSQL_CLIENT_CONNECT
 
 # Load Data
+# note: export STORAGE_ALLOW_INSECURE=true to start databend-query
 for t in ${tables[@]}
 do
     echo "$t"
-    insert_sql="insert into $MYSQL_DATABASE.$t file_format = (type = CSV skip_header = 0 field_delimiter = '|' record_delimiter = '\n')"
-    curl -s -u root: -XPUT "http://localhost:8000/v1/streaming_load" -H "database: tpcds" -H "insert_sql: ${insert_sql}" -F 'upload=@"'${CURDIR}'/data/'$t'.csv"' > /dev/null 2>&1
+    fp="`pwd`/data/$t.csv"
+    echo "copy into ${MYSQL_DATABASE}.$t from 'fs://${fp}' file_format = (type = CSV skip_header = 1 field_delimiter = '|' record_delimiter = '\n')" | $BENDSQL_CLIENT_CONNECT
 done
 
diff --git a/benchmark/tpch/README.md b/benchmark/tpch/README.md
index f473386bb472e..27f253130fd3a 100644
--- a/benchmark/tpch/README.md
+++ b/benchmark/tpch/README.md
@@ -3,6 +3,7 @@
 
 ## Preparing the Table and Data
 
+We use [DuckDB](https://duckdb.org/docs/installation/) to generate TPC-H data.
 To prepare the table and data for the TPC-H Benchmark, run the following command in your shell:
 
 ```shell
diff --git a/benchmark/tpch/gen_data.sh b/benchmark/tpch/gen_data.sh
deleted file mode 100644
index e78c0a76fc6cb..0000000000000
--- a/benchmark/tpch/gen_data.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-
-rm -rf ./data/*
-docker pull ghcr.io/databloom-ai/tpch-docker:main
-docker run -it -v "$(pwd)/data":/data ghcr.io/databloom-ai/tpch-docker:main dbgen -vf -s $1
\ No newline at end of file
diff --git a/benchmark/tpch/prepare_table.sh b/benchmark/tpch/load_data.sh
similarity index 92%
rename from benchmark/tpch/prepare_table.sh
rename to benchmark/tpch/load_data.sh
index a9a5511f6bef9..70a7cd33dcc6b 100644
--- a/benchmark/tpch/prepare_table.sh
+++ b/benchmark/tpch/load_data.sh
@@ -111,9 +111,10 @@ echo "CREATE TABLE IF NOT EXISTS lineitem
 ) CLUSTER BY(l_shipdate, l_orderkey) ${options}" | $BENDSQL_CLIENT_CONNECT
 
 # insert data to tables
+# note: export STORAGE_ALLOW_INSECURE=true to start databend-query
 for t in customer lineitem nation orders partsupp part region supplier
 do
     echo "$t"
-    insert_sql="insert into ${MYSQL_DATABASE}.$t file_format = (type = CSV skip_header = 0 field_delimiter = '|' record_delimiter = '\n')"
-    curl -s -u root: -XPUT "http://localhost:${QUERY_HTTP_HANDLER_PORT}/v1/streaming_load" -H "database: tpch" -H "insert_sql: ${insert_sql}" -F 'upload=@"./data/'$t'.tbl"'
+    fp="`pwd`/data/$t.tbl"
+    echo "copy into ${MYSQL_DATABASE}.$t from 'fs://${fp}' file_format = (type = CSV skip_header = 1 field_delimiter = '|' record_delimiter = '\n')" | $BENDSQL_CLIENT_CONNECT
 done
diff --git a/benchmark/tpch/tpch.sh b/benchmark/tpch/tpch.sh
index 7f66fd8836f3e..2d76a0b10d266 100755
--- a/benchmark/tpch/tpch.sh
+++ b/benchmark/tpch/tpch.sh
@@ -1,13 +1,20 @@
 #!/usr/bin/env bash
 
-# generate tpch data
-sh ./gen_data.sh $1
+
+echo """
+INSTALL tpch;
+LOAD tpch;
+SELECT * FROM dbgen(sf=1); -- sf can be other values, such as 0.1, 1, 10, ...
+EXPORT DATABASE '/tmp/tpch_1/' (FORMAT CSV, DELIMITER '|');
+""" | duckdb
+
+mv /tmp/tpch_1/ "$(pwd)/data/"
 
 if [[ $2 == native ]]; then
     echo "native"
-    sh ./prepare_table.sh "storage_format = 'native' compression = 'lz4'"
+    sh ./load_data.sh "storage_format = 'native' compression = 'lz4'"
 else
     echo "fuse"
-    sh ./prepare_table.sh ""
+    sh ./load_data.sh ""
 fi
 