
Commit 8441fa1

Merge branch 'main' into add-linter-rule
2 parents 6bc5523 + d41c3e0 commit 8441fa1


274 files changed: +10346 additions, −5382 deletions


.circleci/install-prerequisites.sh

Lines changed: 5 additions & 0 deletions
@@ -34,4 +34,9 @@ echo "Installing OS-level dependencies: $ALL_DEPENDENCIES"
 
 sudo apt-get clean && sudo apt-get -y update && sudo ACCEPT_EULA='Y' apt-get -y install $ALL_DEPENDENCIES
 
+if [ "$ENGINE" == "spark" ]; then
+  echo "Using Java version for spark:"
+  java -version
+fi
+
 echo "All done"

.circleci/manage-test-db.sh

Lines changed: 3 additions & 1 deletion
@@ -51,7 +51,9 @@ databricks_init() {
 
   # Note: the cluster doesnt need to be running to create / drop catalogs, but it does need to be running to run the integration tests
   echo "Ensuring cluster is running"
-  databricks clusters start $CLUSTER_ID
+  # the || true is to prevent the following error from causing an abort:
+  # > Error: is in unexpected state Running.
+  databricks clusters start $CLUSTER_ID || true
 }
 
 databricks_up() {

Makefile

Lines changed: 17 additions & 17 deletions
@@ -138,7 +138,7 @@ dbt-test:
 	pytest -n auto -m "dbt and not cicdonly"
 
 dbt-fast-test:
-	pytest -n auto -m "dbt and fast" --retries 3
+	pytest -n auto -m "dbt and fast" --reruns 3
 
 github-test:
 	pytest -n auto -m "github"
@@ -173,58 +173,58 @@ engine-%-down:
 ##################
 
 clickhouse-test: engine-clickhouse-up
-	pytest -n auto -m "clickhouse" --retries 3 --junitxml=test-results/junit-clickhouse.xml
+	pytest -n auto -m "clickhouse" --reruns 3 --junitxml=test-results/junit-clickhouse.xml
 
 duckdb-test: engine-duckdb-install
-	pytest -n auto -m "duckdb" --retries 3 --junitxml=test-results/junit-duckdb.xml
+	pytest -n auto -m "duckdb" --reruns 3 --junitxml=test-results/junit-duckdb.xml
 
 mssql-test: engine-mssql-up
-	pytest -n auto -m "mssql" --retries 3 --junitxml=test-results/junit-mssql.xml
+	pytest -n auto -m "mssql" --reruns 3 --junitxml=test-results/junit-mssql.xml
 
 mysql-test: engine-mysql-up
-	pytest -n auto -m "mysql" --retries 3 --junitxml=test-results/junit-mysql.xml
+	pytest -n auto -m "mysql" --reruns 3 --junitxml=test-results/junit-mysql.xml
 
 postgres-test: engine-postgres-up
-	pytest -n auto -m "postgres" --retries 3 --junitxml=test-results/junit-postgres.xml
+	pytest -n auto -m "postgres" --reruns 3 --junitxml=test-results/junit-postgres.xml
 
 spark-test: engine-spark-up
-	pytest -n auto -m "spark" --retries 3 --junitxml=test-results/junit-spark.xml
+	pytest -n auto -m "spark" --reruns 3 --junitxml=test-results/junit-spark.xml && pytest -n auto -m "pyspark" --reruns 3 --junitxml=test-results/junit-pyspark.xml
 
 trino-test: engine-trino-up
-	pytest -n auto -m "trino" --retries 3 --junitxml=test-results/junit-trino.xml
+	pytest -n auto -m "trino" --reruns 3 --junitxml=test-results/junit-trino.xml
 
 risingwave-test: engine-risingwave-up
-	pytest -n auto -m "risingwave" --retries 3 --junitxml=test-results/junit-risingwave.xml
+	pytest -n auto -m "risingwave" --reruns 3 --junitxml=test-results/junit-risingwave.xml
 
 #################
 # Cloud Engines #
 #################
 
 snowflake-test: guard-SNOWFLAKE_ACCOUNT guard-SNOWFLAKE_WAREHOUSE guard-SNOWFLAKE_DATABASE guard-SNOWFLAKE_USER guard-SNOWFLAKE_PASSWORD engine-snowflake-install
-	pytest -n auto -m "snowflake" --retries 3 --junitxml=test-results/junit-snowflake.xml
+	pytest -n auto -m "snowflake" --reruns 3 --junitxml=test-results/junit-snowflake.xml
 
 bigquery-test: guard-BIGQUERY_KEYFILE engine-bigquery-install
 	$(PIP) install -e ".[bigframes]"
-	pytest -n auto -m "bigquery" --retries 3 --junitxml=test-results/junit-bigquery.xml
+	pytest -n auto -m "bigquery" --reruns 3 --junitxml=test-results/junit-bigquery.xml
 
 databricks-test: guard-DATABRICKS_CATALOG guard-DATABRICKS_SERVER_HOSTNAME guard-DATABRICKS_HTTP_PATH guard-DATABRICKS_ACCESS_TOKEN guard-DATABRICKS_CONNECT_VERSION engine-databricks-install
 	$(PIP) install 'databricks-connect==${DATABRICKS_CONNECT_VERSION}'
-	pytest -n auto -m "databricks" --retries 3 --junitxml=test-results/junit-databricks.xml
+	pytest -n auto -m "databricks" --reruns 3 --junitxml=test-results/junit-databricks.xml
 
 redshift-test: guard-REDSHIFT_HOST guard-REDSHIFT_USER guard-REDSHIFT_PASSWORD guard-REDSHIFT_DATABASE engine-redshift-install
-	pytest -n auto -m "redshift" --retries 3 --junitxml=test-results/junit-redshift.xml
+	pytest -n auto -m "redshift" --reruns 3 --junitxml=test-results/junit-redshift.xml
 
 clickhouse-cloud-test: guard-CLICKHOUSE_CLOUD_HOST guard-CLICKHOUSE_CLOUD_USERNAME guard-CLICKHOUSE_CLOUD_PASSWORD engine-clickhouse-install
-	pytest -n 1 -m "clickhouse_cloud" --retries 3 --junitxml=test-results/junit-clickhouse-cloud.xml
+	pytest -n 1 -m "clickhouse_cloud" --reruns 3 --junitxml=test-results/junit-clickhouse-cloud.xml
 
 athena-test: guard-AWS_ACCESS_KEY_ID guard-AWS_SECRET_ACCESS_KEY guard-ATHENA_S3_WAREHOUSE_LOCATION engine-athena-install
-	pytest -n auto -m "athena" --retries 3 --junitxml=test-results/junit-athena.xml
+	pytest -n auto -m "athena" --reruns 3 --junitxml=test-results/junit-athena.xml
 
 fabric-test: guard-FABRIC_HOST guard-FABRIC_CLIENT_ID guard-FABRIC_CLIENT_SECRET guard-FABRIC_DATABASE engine-fabric-install
-	pytest -n auto -m "fabric" --retries 3 --junitxml=test-results/junit-fabric.xml
+	pytest -n auto -m "fabric" --reruns 3 --junitxml=test-results/junit-fabric.xml
 
 gcp-postgres-test: guard-GCP_POSTGRES_INSTANCE_CONNECTION_STRING guard-GCP_POSTGRES_USER guard-GCP_POSTGRES_PASSWORD guard-GCP_POSTGRES_KEYFILE_JSON engine-gcppostgres-install
-	pytest -n auto -m "gcp_postgres" --retries 3 --junitxml=test-results/junit-gcp-postgres.xml
+	pytest -n auto -m "gcp_postgres" --reruns 3 --junitxml=test-results/junit-gcp-postgres.xml
 
 vscode_settings:
 	mkdir -p .vscode

docs/integrations/dbt.md

Lines changed: 8 additions & 16 deletions
@@ -219,7 +219,7 @@ This section describes how to adapt dbt's incremental models to run on sqlmesh a
 SQLMesh supports two approaches to implement [idempotent](../concepts/glossary.md#idempotency) incremental loads:
 
 * Using merge (with the sqlmesh [`INCREMENTAL_BY_UNIQUE_KEY` model kind](../concepts/models/model_kinds.md#incremental_by_unique_key))
-* Using insert-overwrite/delete+insert (with the sqlmesh [`INCREMENTAL_BY_TIME_RANGE` model kind](../concepts/models/model_kinds.md#incremental_by_time_range))
+* Using [`INCREMENTAL_BY_TIME_RANGE` model kind](../concepts/models/model_kinds.md#incremental_by_time_range)
 
 #### Incremental by unique key
 
@@ -233,28 +233,22 @@ To enable incremental_by_unique_key incrementality, the model configuration shou
 
 #### Incremental by time range
 
-To enable incremental_by_time_range incrementality, the model configuration should contain:
+To enable incremental_by_time_range incrementality, the model configuration must contain:
 
-* The `time_column` key with the model's time column field name as the value (see [`time column`](../concepts/models/model_kinds.md#time-column) for details)
 * The `materialized` key with value `'incremental'`
-* Either:
-    * The `incremental_strategy` key with value `'insert_overwrite'` or
-    * The `incremental_strategy` key with value `'delete+insert'`
-    * Note: in this context, these two strategies are synonyms. Regardless of which one is specified SQLMesh will use the [`best incremental strategy`](../concepts/models/model_kinds.md#materialization-strategy) for the target engine.
+* The `incremental_strategy` key with the value `incremental_by_time_range`
+* The `time_column` key with the model's time column field name as the value (see [`time column`](../concepts/models/model_kinds.md#time-column) for details)
 
 ### Incremental logic
 
-SQLMesh requires a new jinja block gated by `{% if sqlmesh_incremental is defined %}`. The new block should supersede the existing `{% if is_incremental() %}` block and contain the `WHERE` clause selecting the time interval.
+Unlike dbt incremental strategies, SQLMesh does not require the use of `is_incremental` jinja blocks to implement incremental logic.
+Instead, SQLMesh provides predefined time macro variables that can be used in the model's SQL to filter data based on the time column.
 
 For example, the SQL `WHERE` clause with the "ds" column goes in a new jinja block gated by `{% if sqlmesh_incremental is defined %}` as follows:
 
 ```bash
-> {% if sqlmesh_incremental is defined %}
 > WHERE
 >   ds BETWEEN '{{ start_ds }}' AND '{{ end_ds }}'
-> {% elif is_incremental() %}
-> ; < your existing is_incremental block >
-> {% endif %}
 ```
 
 `{{ start_ds }}` and `{{ end_ds }}` are the jinja equivalents of SQLMesh's `@start_ds` and `@end_ds` predefined time macro variables. See all [predefined time variables](../concepts/macros/macro_variables.md) available in jinja.
@@ -263,13 +257,11 @@ For example, the SQL `WHERE` clause with the "ds" column goes in a new jinja blo
 
 SQLMesh provides configuration parameters that enable control over how incremental computations occur. These parameters are set in the model's `config` block.
 
-The [`batch_size` parameter](../concepts/models/overview.md#batch_size) determines the maximum number of time intervals to run in a single job.
-
-The [`lookback` parameter](../concepts/models/overview.md#lookback) is used to capture late arriving data. It sets the number of units of late arriving data the model should expect and must be a positive integer.
+See [Incremental Model Properties](../concepts/models/overview.md#incremental-model-properties) for the full list of incremental model configuration parameters.
 
 **Note:** By default, all incremental dbt models are configured to be [forward-only](../concepts/plans.md#forward-only-plans). However, you can change this behavior by setting the `forward_only: false` setting either in the configuration of an individual model or globally for all models in the `dbt_project.yaml` file. The [forward-only](../concepts/plans.md#forward-only-plans) mode aligns more closely with the typical operation of dbt and therefore better meets user's expectations.
 
-Similarly, the [allow_partials](../concepts/models/overview.md#allow_partials) parameter is set to `true` by default for incremental dbt models unless the time column is specified, or the `allow_partials` parameter is explicitly set to `false` in the model configuration.
+Similarly, the [allow_partials](../concepts/models/overview.md#allow_partials) parameter is set to `true` by default unless the `allow_partials` parameter is explicitly set to `false` in the model configuration.
 
 #### on_schema_change
 
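Read together with the `examples/sushi_dbt` changes further down in this commit, the updated docs describe a model of roughly the following shape. This is an illustrative sketch only: the `stg_orders` source and the column names are hypothetical, while the config keys and the `WHERE` filter mirror the documentation and examples above.

```sql
{{
  config(
    materialized='incremental',
    incremental_strategy='incremental_by_time_range',
    time_column='ds',
  )
}}

SELECT
  ds,                       -- the model's time column
  customer_id,              -- hypothetical column for illustration
  SUM(amount) AS revenue    -- hypothetical column for illustration
FROM {{ ref('stg_orders') }}
WHERE
  ds BETWEEN '{{ start_ds }}' AND '{{ end_ds }}'
GROUP BY
  ds,
  customer_id
```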
docs/integrations/engines/trino.md

Lines changed: 34 additions & 13 deletions
@@ -81,19 +81,21 @@ hive.metastore.glue.default-warehouse-dir=s3://my-bucket/
 
 ### Connection options
 
 | Option | Description | Type | Required |
 |----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------:|:--------:|
 | `type` | Engine type name - must be `trino` | string | Y |
 | `user` | The username (of the account) to log in to your cluster. When connecting to Starburst Galaxy clusters, you must include the role of the user as a suffix to the username. | string | Y |
 | `host` | The hostname of your cluster. Don't include the `http://` or `https://` prefix. | string | Y |
 | `catalog` | The name of a catalog in your cluster. | string | Y |
 | `http_scheme` | The HTTP scheme to use when connecting to your cluster. By default, it's `https` and can only be `http` for no-auth or basic auth. | string | N |
 | `port` | The port to connect to your cluster. By default, it's `443` for `https` scheme and `80` for `http` | int | N |
 | `roles` | Mapping of catalog name to a role | dict | N |
 | `http_headers` | Additional HTTP headers to send with each request. | dict | N |
 | `session_properties` | Trino session properties. Run `SHOW SESSION` to see all options. | dict | N |
 | `retries` | Number of retries to attempt when a request fails. Default: `3` | int | N |
 | `timezone` | Timezone to use for the connection. Default: client-side local timezone | string | N |
+| `schema_location_mapping` | A mapping of regex patterns to S3 locations to use for the `LOCATION` property when creating schemas. See [Table and Schema locations](#table-and-schema-locations) for more details. | dict | N |
+| `catalog_type_overrides` | A mapping of catalog names to their connector type. This is used to enable/disable connector specific behavior. See [Catalog Type Overrides](#catalog-type-overrides) for more details. | dict | N |
 
 ## Table and Schema locations
 
@@ -204,6 +206,25 @@ SELECT ...
 
 This will cause SQLMesh to set the specified `LOCATION` when issuing a `CREATE TABLE` statement.
 
+## Catalog Type Overrides
+
+SQLMesh attempts to determine the connector type of a catalog by querying the `system.metadata.catalogs` table and checking the `connector_name` column.
+It checks if the connector name is `hive` for Hive connector behavior or contains `iceberg` or `delta_lake` for Iceberg or Delta Lake connector behavior respectively.
+However, the connector name may not always be a reliable way to determine the connector type, for example when using a custom connector or a fork of an existing connector.
+To handle such cases, you can use the `catalog_type_overrides` connection property to explicitly specify the connector type for specific catalogs.
+For example, to specify that the `datalake` catalog is using the Iceberg connector and the `analytics` catalog is using the Hive connector, you can configure the connection as follows:
+
+```yaml title="config.yaml"
+gateways:
+  trino:
+    connection:
+      type: trino
+      ...
+      catalog_type_overrides:
+        datalake: iceberg
+        analytics: hive
+```
+
 ## Authentication
 
 === "No Auth"
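The new doc section above says SQLMesh inspects `system.metadata.catalogs` and its `connector_name` column. A quick way to preview what it will find before deciding whether you need overrides is a query like the minimal sketch below, assuming your Trino user can read the `system` catalog and your Trino version exposes the `connector_name` column referenced in the docs:

```sql
-- List each catalog alongside the connector name SQLMesh inspects.
-- Catalogs whose connector_name is not recognized (e.g. a custom fork)
-- are candidates for catalog_type_overrides.
SELECT
  catalog_name,
  connector_name
FROM system.metadata.catalogs
ORDER BY catalog_name;
```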

examples/sushi_dbt/models/customer_revenue_by_day.sql

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {{
   config(
     materialized='incremental',
-    incremental_strategy='delete+insert',
+    incremental_strategy='incremental_by_time_range',
     cluster_by=['ds'],
     time_column='ds',
   )

examples/sushi_dbt/models/waiter_as_customer_by_day.sql

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {{
   config(
     materialized='incremental',
-    incremental_strategy='delete+insert',
+    incremental_strategy='incremental_by_time_range',
     cluster_by=['ds'],
     time_column='ds',
   )

examples/sushi_dbt/models/waiter_revenue_by_day.sql

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {{
   config(
     materialized='incremental',
-    incremental_strategy='delete+insert',
+    incremental_strategy='incremental_by_time_range',
     cluster_by=['ds'],
     time_column='ds',
   )

examples/sushi_dbt/models/waiter_revenue_by_day_v1.sql

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {{
   config(
     materialized='incremental',
-    incremental_strategy='delete+insert',
+    incremental_strategy='incremental_by_time_range',
     cluster_by=['ds'],
     time_column='ds',
   )
