diff --git a/docs/overture-examples.ipynb b/docs/overture-examples.ipynb index f542a6d08..f312d8619 100644 --- a/docs/overture-examples.ipynb +++ b/docs/overture-examples.ipynb @@ -28,7 +28,7 @@ "\n", "> Note: Before running this notebook, ensure that you have installed SedonaDB: `pip install \"apache-sedona[db]\"`\n", "\n", - "This notebook demonstrates how to query and analyze the [Overture Maps](https://overturemaps.org/) dataset using SedonaDB. See [this page](https://docs.overturemaps.org/release-calendar/) to get the latest version of the Overture data.\n", + "This notebook demonstrates how to query and analyze the [Overture Maps](https://overturemaps.org/) dataset using SedonaDB. \n", "\n", "The notebook explains how to:\n", "\n", @@ -39,534 +39,429 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "c5e580ff", - "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: lonboard in /opt/miniconda3/lib/python3.12/site-packages (0.12.1)\n", - "Requirement already satisfied: anywidget~=0.9.0 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (0.9.18)\n", - "Requirement already satisfied: arro3-compute>=0.4.1 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (0.6.3)\n", - "Requirement already satisfied: arro3-core>=0.4.1 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (0.6.3)\n", - "Requirement already satisfied: arro3-io>=0.4.1 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (0.6.3)\n", - "Requirement already satisfied: geoarrow-rust-core>=0.5.2 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (0.5.2)\n", - "Requirement already satisfied: ipywidgets>=7.6.0 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (8.1.7)\n", - "Requirement already satisfied: numpy>=1.14 in /opt/miniconda3/lib/python3.12/site-packages 
(from lonboard) (2.3.3)\n", - "Requirement already satisfied: pyproj>=3.3 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (3.7.2)\n", - "Requirement already satisfied: traitlets>=5.7.1 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (5.14.3)\n", - "Requirement already satisfied: psygnal>=0.8.1 in /opt/miniconda3/lib/python3.12/site-packages (from anywidget~=0.9.0->lonboard) (0.14.1)\n", - "Requirement already satisfied: typing-extensions>=4.2.0 in /opt/miniconda3/lib/python3.12/site-packages (from anywidget~=0.9.0->lonboard) (4.15.0)\n", - "Requirement already satisfied: comm>=0.1.3 in /opt/miniconda3/lib/python3.12/site-packages (from ipywidgets>=7.6.0->lonboard) (0.2.3)\n", - "Requirement already satisfied: ipython>=6.1.0 in /opt/miniconda3/lib/python3.12/site-packages (from ipywidgets>=7.6.0->lonboard) (9.5.0)\n", - "Requirement already satisfied: widgetsnbextension~=4.0.14 in /opt/miniconda3/lib/python3.12/site-packages (from ipywidgets>=7.6.0->lonboard) (4.0.14)\n", - "Requirement already satisfied: jupyterlab_widgets~=3.0.15 in /opt/miniconda3/lib/python3.12/site-packages (from ipywidgets>=7.6.0->lonboard) (3.0.15)\n", - "Requirement already satisfied: certifi in /opt/miniconda3/lib/python3.12/site-packages (from pyproj>=3.3->lonboard) (2025.8.3)\n", - "Requirement already satisfied: decorator in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (5.2.1)\n", - "Requirement already satisfied: ipython-pygments-lexers in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (1.1.1)\n", - "Requirement already satisfied: jedi>=0.16 in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.19.2)\n", - "Requirement already satisfied: matplotlib-inline in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.1.7)\n", - "Requirement already satisfied: pexpect>4.3 
in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (4.9.0)\n", - "Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (3.0.52)\n", - "Requirement already satisfied: pygments>=2.4.0 in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (2.19.2)\n", - "Requirement already satisfied: stack_data in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.6.3)\n", - "Requirement already satisfied: parso<0.9.0,>=0.8.4 in /opt/miniconda3/lib/python3.12/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.8.5)\n", - "Requirement already satisfied: ptyprocess>=0.5 in /opt/miniconda3/lib/python3.12/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.7.0)\n", - "Requirement already satisfied: wcwidth in /opt/miniconda3/lib/python3.12/site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.2.14)\n", - "Requirement already satisfied: executing>=1.2.0 in /opt/miniconda3/lib/python3.12/site-packages (from stack_data->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (2.2.1)\n", - "Requirement already satisfied: asttokens>=2.1.0 in /opt/miniconda3/lib/python3.12/site-packages (from stack_data->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (3.0.0)\n", - "Requirement already satisfied: pure-eval in /opt/miniconda3/lib/python3.12/site-packages (from stack_data->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.2.3)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "%pip install lonboard" - ] - }, - { - "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "6d6fa0ab-b4ed-4e60-b099-a1af0998b486", "metadata": {}, "outputs": [], "source": [ "import sedona.db\n", - "import os\n", - 
"\n", - "os.environ[\"AWS_SKIP_SIGNATURE\"] = \"true\"\n", - "os.environ[\"AWS_DEFAULT_REGION\"] = \"us-west-2\"\n", "\n", "sd = sedona.db.connect()" ] }, { "cell_type": "markdown", - "id": "4f44adfb-2973-4a65-b4f2-d24b28700b79", + "id": "4d7e32aa", "metadata": {}, "source": [ - "## Overture buildings table" + "## Overture divisions\n", + "\n", + "Let's load a table! Like any local or remote collection of Parquet files, we can use `sd.read_parquet()`. This is a lazy operation, fetching only metadata required to calculate a table schema. To reduce the number of times this needs to happen (and make the resulting DataFrame easier to reference in SQL), we use `.to_view()`.\n", + "\n", + "> Overture removes old releases. See [this page](https://docs.overturemaps.org/release-calendar/#current-release) to see the latest version number and replace the relevant portion of the URL below." ] }, { "cell_type": "code", - "execution_count": 3, - "id": "52855769-4872-472a-9c42-afced3d85ca8", + "execution_count": 2, + "id": "a205670e", "metadata": {}, "outputs": [], "source": [ - "df = sd.read_parquet(\n", - " \"s3://overturemaps-us-west-2/release/2025-11-19.0/theme=buildings/type=building/\"\n", - ")" + "sd.read_parquet(\n", + " \"s3://overturemaps-us-west-2/release/2026-02-18.0/theme=divisions/type=division_area/\",\n", + " options={\"aws.skip_signature\": True, \"aws.region\": \"us-west-2\"},\n", + ").to_view(\"divisions\")" ] }, { - "cell_type": "code", - "execution_count": 4, - "id": "b45b5e5c-64ed-49ba-a8aa-9f2292f617c6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "┌──────────────────────────────────────┬─────────────────────────────────────────┬───┬─────────────┐\n", - "│ id ┆ geometry ┆ … ┆ roof_height │\n", - "│ utf8 ┆ geometry ┆ ┆ float64 │\n", - "╞══════════════════════════════════════╪═════════════════════════════════════════╪═══╪═════════════╡\n", - "│ 85b47da4-1b8d-4132-ac6c-d8dc14fab4b8 ┆ POLYGON((-6.4292972 
54.8290034,-6.4291… ┆ … ┆ │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ ec12e345-d44d-4e40-8e08-e1e6e68d4d17 ┆ POLYGON((-6.430836 54.8299412,-6.43095… ┆ … ┆ │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 285f9ff9-2d6d-409c-b214-74992c8d7e7d ┆ POLYGON((-6.4311579 54.8300247,-6.4313… ┆ … ┆ │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ abedfc7c-e5fd-4a29-931e-da77b610d02d ┆ POLYGON((-6.4321833 54.8294427,-6.4322… ┆ … ┆ │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ a203a2c6-e130-4979-a7d5-8a059c6f31fd ┆ POLYGON((-6.4300627 54.829276,-6.43006… ┆ … ┆ │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 1d14caf6-b12d-486e-87dd-feef82fba9a7 ┆ POLYGON((-6.4301786 54.8281533,-6.4299… ┆ … ┆ │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 4b1e67cf-7355-439b-9a31-46a50f3ee227 ┆ POLYGON((-6.4298614 54.8278977,-6.4299… ┆ … ┆ │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 06de994e-efd4-4a1c-8a20-b4e883904cb2 ┆ POLYGON((-6.4296383 54.827599,-6.42956… ┆ … ┆ │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ ea0b2ea6-7c52-4395-9baa-bc023c7d3166 ┆ POLYGON((-6.4296844 54.8277379,-6.4296… ┆ … ┆ │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 49f022ef-5574-4613-ae54-af139666fde3 ┆ POLYGON((-6.4296843 54.8278169,-6.4296… ┆ … ┆ │\n", - "└──────────────────────────────────────┴─────────────────────────────────────────┴───┴─────────────┘\n" - ] - } - ], - "source": 
[ - "df.limit(10).show()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "e37a023e-0e80-403a-a65b-b5a190004f72", + "cell_type": "markdown", + "id": "83a37848", "metadata": {}, - "outputs": [], "source": [ - "df.to_view(\"buildings\")" + "We can preview the first few rows using `.show()`. Because this is a lazy operation and we've already cached the schema using `.to_view()`, this only takes a few seconds." ] }, { "cell_type": "code", - "execution_count": 6, - "id": "ebfe4776-e08f-4f38-97fc-fca8ec6fc364", + "execution_count": 3, + "id": "62c19bf2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "┌────────────┐\n", - "│ count(*) │\n", - "│ int64 │\n", - "╞════════════╡\n", - "│ 2541497985 │\n", - "└────────────┘\n" + "┌───────────────┬───────────────┬──────────────┬─────────┬───┬────────┬─────────────┬──────────────┐\n", + "│ id ┆ geometry ┆ bbox ┆ country ┆ … ┆ region ┆ admin_level ┆ division_id │\n", + "│ utf8 ┆ geometry ┆ struct ┆ utf8 ┆ ┆ utf8 ┆ int32 ┆ utf8 │\n", + "╞═══════════════╪═══════════════╪══════════════╪═════════╪═══╪════════╪═════════════╪══════════════╡\n", + "│ a5c573c4-022… ┆ POLYGON((-49… ┆ {xmin: -49.… ┆ BR ┆ … ┆ BR-PR ┆ ┆ 388a8056-ee… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ cf523f8c-c26… ┆ POLYGON((-49… ┆ {xmin: -49.… ┆ BR ┆ … ┆ BR-PR ┆ ┆ 068ef37e-3b… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ 8ace3d06-b8a… ┆ POLYGON((-49… ┆ {xmin: -49.… ┆ BR ┆ … ┆ BR-PR ┆ ┆ 7238aeb3-b8… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ b26d2cba-b54… ┆ POLYGON((-49… ┆ {xmin: -49.… ┆ BR ┆ … ┆ BR-PR ┆ ┆ 3c2dc8fc-79… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ 20103725-17c… ┆ POLYGON((-49… ┆ {xmin: -49.… ┆ BR ┆ … 
┆ BR-PR ┆ ┆ 45037e82-de… │\n", + "└───────────────┴───────────────┴──────────────┴─────────┴───┴────────┴─────────────┴──────────────┘\n" ] } ], "source": [ - "# the buildings table is large and contains billions of rows\n", - "sd.sql(\"\"\"\n", - "SELECT\n", - " COUNT(*)\n", - "FROM\n", - " buildings\n", - "\"\"\").show()" + "sd.view(\"divisions\").show(5)" + ] + }, + { + "cell_type": "markdown", + "id": "06fa447a", + "metadata": {}, + "source": [ + "The default view of the data hides some columns to ensure the entire output can be shown. To look at all the columns with type details, use `.schema`:" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "b73f670d-0d10-4a7a-bfc7-e2abe5d9edd2", + "execution_count": 4, + "id": "471fd72f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "SedonaSchema with 24 fields:\n", + "SedonaSchema with 14 fields:\n", " id: utf8\n", " geometry: geometry\n", - " bbox: struct\n", + " bbox: struct\n", + " country: utf8\n", " version: int32\n", - " sources: list\n", - " level: int32\n", + " sources: list\n", " subtype: utf8\n", " class: utf8\n", - " height: float64\n", - " names: struct\n", - " has_parts: boolean\n", - " is_underground: boolean\n", - " num_floors: int32\n", - " num_floors_underground: int32\n", - " min_height: float64\n", - " min_floor: int32\n", - " facade_color: utf8\n", - " facade_material: utf8\n", - " roof_material: utf8\n", - " roof_shape: utf8\n", - " roof_direction: float64\n", - " roof_orientation: utf8\n", - " roof_color: utf8\n", - " roof_height: float64" + " names: struct\n", + " is_land: boolean\n", + " is_territorial: boolean\n", + " region: utf8\n", + " admin_level: int32\n", + " division_id: utf8" ] }, - "execution_count": 7, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# check out the schema of the buildings table to see what it contains\n", - "df.schema" + "sd.view(\"divisions\").schema" + ] + }, + { + "cell_type": "markdown", + "id": 
"925a4b10", + "metadata": {}, + "source": [ + "Overture data makes heavy use of nested types. These can be indexed into or expanded using SQL:" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "68d1b68c-dd26-45c2-944f-61138b212943", + "execution_count": 5, + "id": "85710387", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "┌─────────────────────────┬────────────────────┬────────────┬────────────┬─────────────────────────┐\n", - "│ id ┆ height ┆ num_floors ┆ roof_shape ┆ centroid │\n", - "│ utf8 ┆ float64 ┆ int32 ┆ utf8 ┆ geometry │\n", - "╞═════════════════════════╪════════════════════╪════════════╪════════════╪═════════════════════════╡\n", - "│ aa8e3a73-c72c-4f1a-b6e… ┆ 20.38205909729004 ┆ ┆ ┆ POINT(-74.187673580307… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ efe7616b-7f7e-464c-9ce… ┆ 26.18361473083496 ┆ ┆ ┆ POINT(-74.189040982134… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ b3f734a1-325b-4e8c-b1d… ┆ 27.025876998901367 ┆ ┆ ┆ POINT(-74.2558161 40.8… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 45d88655-e2f4-4a08-926… ┆ 25.485210418701172 ┆ ┆ ┆ POINT(-74.182252194444… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 31e8353c-7d5b-4b20-94e… ┆ 21.294815063476562 ┆ ┆ ┆ POINT(-74.197113787905… │\n", - "└─────────────────────────┴────────────────────┴────────────┴────────────┴─────────────────────────┘\n" + "┌────────────────────────────────────┬─────────────────────────────────────────────────────────────┐\n", + "│ name ┆ geometry │\n", + "│ utf8 ┆ geometry │\n", + "╞════════════════════════════════════╪═════════════════════════════════════════════════════════════╡\n", + "│ Sable Island National Park Reserve ┆ POLYGON((-60.178333 
43.9824655,-60.1785682 43.9825425,-60.… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Sable Island ┆ POLYGON((-59.7744732 44.2254616,-59.7928902 44.2173253,-59… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Halifax Regional Municipality ┆ MULTIPOLYGON(((-59.7321078 44.2390248,-59.7502166 44.23385… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ West Liscomb ┆ POLYGON((-62.0615594 45.0023306,-62.0621839 45.0024475,-62… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Marie Joseph ┆ POLYGON((-61.9911914 44.95646,-61.9912383 44.9579526,-61.9… │\n", + "└────────────────────────────────────┴─────────────────────────────────────────────────────────────┘\n" ] } ], "source": [ - "# find all the buildings in New York City that are taller than 20 meters\n", - "nyc_bbox_wkt = (\n", - " \"POLYGON((-74.2591 40.4774, -74.2591 40.9176, -73.7004 40.9176, \"\n", - " \"-73.7004 40.4774, -74.2591 40.4774))\"\n", - ")\n", - "sd.sql(f\"\"\"\n", - "SELECT\n", - " id,\n", - " height,\n", - " num_floors,\n", - " roof_shape,\n", - " ST_Centroid(geometry) as centroid\n", - "FROM\n", - " buildings\n", - "WHERE\n", - " is_underground = FALSE\n", - " AND height IS NOT NULL\n", - " AND height > 20\n", - " AND ST_Intersects(\n", - " geometry,\n", - " ST_GeomFromText('{nyc_bbox_wkt}', 4326)\n", - " )\n", - "LIMIT 5;\n", - "\"\"\").show()" + "sd.sql(\n", + " \"SELECT names.primary AS name, geometry FROM divisions WHERE region = 'CA-NS'\"\n", + ").show(5)" ] }, { "cell_type": "markdown", - "id": "e07fcdc1-962b-4dce-90cb-bf715432e299", + "id": "3141209b", "metadata": {}, "source": [ - "## Overture divisions table" + "Like all remote tables, it is worth resolving a query into a concrete local table to avoid fetching 
unnecessary data on repeated queries. The `.to_memtable()` method can be used to resolve a remote table into memory (great for small results); `.to_parquet()` can be used to resolve a remote table to disk (great for medium to large results)." ] }, { "cell_type": "code", - "execution_count": 9, - "id": "d9f122d3-4d90-46b0-ab9a-259a71cc423b", - "metadata": {}, - "outputs": [], - "source": [ - "df = sd.read_parquet(\n", - " \"s3://overturemaps-us-west-2/release/2025-11-19.0/theme=divisions/type=division_area/\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "19a75b19-0b56-4167-b3f1-73a171ecc480", + "execution_count": 6, + "id": "25aae0de", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "┌─────────────────┬────────────────┬────────────────┬───┬────────────────┬────────┬────────────────┐\n", - "│ id ┆ geometry ┆ bbox ┆ … ┆ is_territorial ┆ region ┆ division_id │\n", - "│ utf8 ┆ geometry ┆ struct ┆ ┆ boolean ┆ utf8 ┆ utf8 │\n", - "╞═════════════════╪════════════════╪════════════════╪═══╪════════════════╪════════╪════════════════╡\n", - "│ 3665c36d-d3a9-… ┆ POLYGON((12.5… ┆ {xmin: 12.455… ┆ … ┆ true ┆ IT-34 ┆ f05aa29f-151f… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 18a69439-a1da-… ┆ POLYGON((12.5… ┆ {xmin: 12.596… ┆ … ┆ true ┆ IT-36 ┆ ae00d58c-6e67… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 7d0f6d37-bb55-… ┆ POLYGON((12.6… ┆ {xmin: 12.567… ┆ … ┆ true ┆ IT-36 ┆ bdfc82ca-5f23… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 3f480ff6-6361-… ┆ POLYGON((12.5… ┆ {xmin: 12.549… ┆ … ┆ true ┆ IT-36 ┆ 1c750104-4470… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 31c3ab5e-eb6f-… ┆ POLYGON((12.6… ┆ {xmin: 12.612… ┆ … ┆ 
true ┆ IT-34 ┆ d90804ee-19a4… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 308517e6-64b4-… ┆ POLYGON((12.5… ┆ {xmin: 12.589… ┆ … ┆ true ┆ IT-34 ┆ aabd71e9-4d98… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 646e5b1f-b76a-… ┆ POLYGON((12.5… ┆ {xmin: 12.485… ┆ … ┆ true ┆ IT-34 ┆ 502c1c4e-fc19… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ f2809a49-1082-… ┆ POLYGON((12.5… ┆ {xmin: 12.538… ┆ … ┆ true ┆ IT-34 ┆ 8b446eed-00ad… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 72b27245-c7fd-… ┆ POLYGON((12.5… ┆ {xmin: 12.501… ┆ … ┆ true ┆ IT-34 ┆ 1d535e1f-d19e… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ 815855d9-05d0-… ┆ POLYGON((12.4… ┆ {xmin: 12.371… ┆ … ┆ true ┆ IT-34 ┆ 5aa91354-9e8c… │\n", - "└─────────────────┴────────────────┴────────────────┴───┴────────────────┴────────┴────────────────┘\n" + "┌────────────────────────────────────┬─────────────────────────────────────────────────────────────┐\n", + "│ name ┆ geometry │\n", + "│ utf8 ┆ geometry │\n", + "╞════════════════════════════════════╪═════════════════════════════════════════════════════════════╡\n", + "│ Sable Island National Park Reserve ┆ POLYGON((-60.178333 43.9824655,-60.1785682 43.9825425,-60.… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Sable Island ┆ POLYGON((-59.7744732 44.2254616,-59.7928902 44.2173253,-59… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Halifax Regional Municipality ┆ MULTIPOLYGON(((-59.7321078 44.2390248,-59.7502166 44.23385… │\n", + 
"├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ West Liscomb ┆ POLYGON((-62.0615594 45.0023306,-62.0621839 45.0024475,-62… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Marie Joseph ┆ POLYGON((-61.9911914 44.95646,-61.9912383 44.9579526,-61.9… │\n", + "└────────────────────────────────────┴─────────────────────────────────────────────────────────────┘\n" ] } ], "source": [ - "# inspect a few rows of the data\n", - "df.show(10)" + "sd.sql(\n", + " \"SELECT names.primary AS name, geometry FROM divisions WHERE region = 'CA-NS'\"\n", + ").to_memtable().to_view(\"divisions_ns\")\n", + "\n", + "sd.view(\"divisions_ns\").show(5)" ] }, { - "cell_type": "code", - "execution_count": 11, - "id": "03b951de-3397-4fcf-9baf-50e139a38dd4", + "cell_type": "markdown", + "id": "2828dfc0", "metadata": {}, - "outputs": [], "source": [ - "df.to_view(\"division_area\")" + "Importantly, Overture data is distributed using GeoParquet 1.1, for which SedonaDB has built in support! This means that spatial queries (e.g., `ST_Intersects()`) tend to execute quickly against overture. In this case, the spatial query for Nova Scotia is ~5x faster than the text-based region query." 
] }, { "cell_type": "code", - "execution_count": 12, - "id": "9c6bd69d-9407-432a-bdc8-d60976237a3a", + "execution_count": 7, + "id": "f43824dc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "┌──────────┐\n", - "│ count(*) │\n", - "│ int64 │\n", - "╞══════════╡\n", - "│ 1052542 │\n", - "└──────────┘\n" + "┌───────────────────┬──────────────────────────────────────────────────────────────────────────────┐\n", + "│ name ┆ geometry │\n", + "│ utf8 ┆ geometry │\n", + "╞═══════════════════╪══════════════════════════════════════════════════════════════════════════════╡\n", + "│ Maces Bay ┆ POLYGON((-66.4491254 45.1265729,-66.4577261 45.126933,-66.4591563 45.126991… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Gooseberry Island ┆ POLYGON((-66.2598821 45.1380421,-66.2599962 45.1381233,-66.2600591 45.13828… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Musquash Parish ┆ POLYGON((-66.4595418 45.2215004,-66.4595406 45.221468,-66.4595396 45.221391… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Dipper Harbour ┆ POLYGON((-66.3755086 45.118812,-66.4089711 45.1488327,-66.4284252 45.138119… │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ Chance Harbour ┆ POLYGON((-66.4089711 45.1488327,-66.3755086 45.118812,-66.3541725 45.105991… │\n", + "└───────────────────┴──────────────────────────────────────────────────────────────────────────────┘\n" ] } ], "source": [ - "sd.sql(\"\"\"\n", - "SELECT\n", - " COUNT(*)\n", - "FROM division_area\n", - "\"\"\").show()" + "import shapely\n", + "\n", + "ns_bbox_wkb = shapely.box(-66.5, 43.4, -59.8, 47.1).wkb\n", + "\n", + "sd.sql(\n", + " \"\"\"\n", + " SELECT names.primary AS name, geometry\n", + " FROM divisions\n", + " 
WHERE ST_Contains(ST_GeomFromWKB($wkb, 4326), geometry)\n", + " \"\"\",\n", + " params={\"wkb\": ns_bbox_wkb},\n", + ").to_memtable().to_view(\"divisions_ns\", overwrite=True)\n", + "\n", + "sd.view(\"divisions_ns\").show(5)" ] }, { - "cell_type": "code", - "execution_count": 13, - "id": "75a6d0ed-9767-4d36-a77a-4afb7952fbe4", + "cell_type": "markdown", + "id": "4f44adfb-2973-4a65-b4f2-d24b28700b79", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "SedonaSchema with 13 fields:\n", - " id: utf8\n", - " geometry: geometry\n", - " bbox: struct\n", - " country: utf8\n", - " version: int32\n", - " sources: list\n", - " subtype: utf8\n", - " class: utf8\n", - " names: struct\n", - " is_land: boolean\n", - " is_territorial: boolean\n", - " region: utf8\n", - " division_id: utf8" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "df.schema" + "## Overture buildings table\n", + "\n", + "The [Overture buildings table](https://docs.overturemaps.org/guides/buildings/) is one of the largest tables provided by the Overture Maps Foundation. The workflow is similar to the divisions table or any remote table; however, the buildings table presents several unique challenges.\n", + "\n", + "First, the metadata size for all files in the buildings table is very large. SedonaDB caches remote metadata to avoid repeated download; however, the default cache size is too small. For repeated queries against the buildings table, ensure that the cache size is increased to at least 900 MB and/or `.to_view()` is used to cache the schema. The cache lives as long as the session. To clear it, use `sd = sedona.db.connect()` or reset the cache size to a smaller value.\n", + "\n", + "> Overture removes old releases. See [this page](https://docs.overturemaps.org/release-calendar/#current-release) to see the latest version number and replace the relevant portion of the URL below." 
] }, { "cell_type": "code", - "execution_count": 14, - "id": "f1f7158c-ef2b-4377-9bee-180309ddd553", + "execution_count": 8, + "id": "52855769-4872-472a-9c42-afced3d85ca8", "metadata": {}, "outputs": [], "source": [ - "# get all the divisions in Nova Scotia and save them in memory with to_memtable()\n", - "nova_scotia_bbox_wkt = (\n", - " \"POLYGON((-66.5 43.4, -66.5 47.1, -59.8 47.1, -59.8 43.4, -66.5 43.4))\"\n", - ")\n", - "ns = sd.sql(f\"\"\"\n", - "SELECT\n", - " country, region, names, geometry\n", - "FROM division_area\n", - "WHERE\n", - " ST_Intersects(\n", - " geometry,\n", - " ST_GeomFromText('{nova_scotia_bbox_wkt}', 4326)\n", - " )\n", - "\"\"\").to_memtable()" + "sd.sql(\"SET datafusion.runtime.metadata_cache_limit = '900M'\").execute()\n", + "\n", + "sd.read_parquet(\n", + " \"s3://overturemaps-us-west-2/release/2026-02-18.0/theme=buildings/type=building/\",\n", + " options={\"aws.skip_signature\": True, \"aws.region\": \"us-west-2\"},\n", + ").to_view(\"buildings\")" ] }, { - "cell_type": "code", - "execution_count": 15, - "id": "27e6909d-06fa-438b-88e0-d300fd2fb1ec", + "cell_type": "markdown", + "id": "fb89b55f", "metadata": {}, - "outputs": [], "source": [ - "ns.to_view(\"ns_divisions\")" + "As with all SedonaDB DataFrames, viewing a schema or previewing the first few rows is lazy and usually fast unless a query contains large aggregations or joins." 
] }, { "cell_type": "code", - "execution_count": 16, - "id": "2dec92d8-a374-4021-990a-e50f5769516e", + "execution_count": 9, + "id": "b45b5e5c-64ed-49ba-a8aa-9f2292f617c6", "metadata": {}, - "outputs": [], - "source": [ - "df = sd.sql(\"\"\"\n", - "SELECT UNNEST(names), geometry\n", - "FROM ns_divisions\n", - "WHERE region = 'CA-NS'\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "7f39a283-0eee-4f72-a30a-8dd9fa1aaa69", - "metadata": { - "scrolled": true - }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "┌────────────────────────┬────────────────────────┬────────────────────────┬───────────────────────┐\n", - "│ __unnest_placeholder(n ┆ __unnest_placeholder(n ┆ __unnest_placeholder(n ┆ geometry │\n", - "│ s_divisions.names).pr… ┆ s_divisions.names).co… ┆ s_divisions.names).ru… ┆ geometry │\n", - "╞════════════════════════╪════════════════════════╪════════════════════════╪═══════════════════════╡\n", - "│ Apple River ┆ ┆ ┆ POLYGON((-64.7260681… │\n", - "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", - "│ Allen Hill ┆ ┆ ┆ POLYGON((-64.6956656… │\n", - "└────────────────────────┴────────────────────────┴────────────────────────┴───────────────────────┘\n", - "CPU times: user 1.25 ms, sys: 805 μs, total: 2.05 ms\n", - "Wall time: 1.42 ms\n" + "┌──────────────────────────────────────┬─────────────────────────────────────────┬───┬─────────────┐\n", + "│ id ┆ geometry ┆ … ┆ roof_height │\n", + "│ utf8 ┆ geometry ┆ ┆ float64 │\n", + "╞══════════════════════════════════════╪═════════════════════════════════════════╪═══╪═════════════╡\n", + "│ ab23f7ee-4c05-4246-a016-8260ce58a916 ┆ POLYGON((-67.589523 -39.0908362,-67.58… ┆ … ┆ │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ 58356258-2e80-48fc-aacf-d81fcf74074c ┆ POLYGON((-67.5896327 -39.0907868,-67.5… ┆ … ┆ │\n", + 
"├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ b50595a8-cddb-44dd-bdbf-7bbe1e858ae0 ┆ POLYGON((-67.5897117 -39.0908483,-67.5… ┆ … ┆ │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ cbabe2df-f49a-4e9f-9cbe-c527a4b3b9f1 ┆ POLYGON((-67.5898768 -39.0907073,-67.5… ┆ … ┆ │\n", + "├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ bcd6984b-8da4-4dfe-9212-be2b02a24b67 ┆ POLYGON((-67.5901879 -39.0908288,-67.5… ┆ … ┆ │\n", + "└──────────────────────────────────────┴─────────────────────────────────────────┴───┴─────────────┘\n" ] } ], "source": [ - "%%time\n", - "# this executes quickly because the Nova Scotia data was persisted in memory with `to_memtable()`\n", - "df.show(2)" + "sd.view(\"buildings\").show(5)" ] }, { "cell_type": "markdown", - "id": "fc1d2023-c83a-4010-808b-212161b1b577", + "id": "caaa29c4", "metadata": {}, "source": [ - "## Visualize the results with lonboard" + "Some operations like `.count()` use summary statistics and execute quickly even for large remote tables:" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "f78583fd-a73a-4169-9c45-74d8026bb5fb", + "execution_count": 10, + "id": "e37a023e-0e80-403a-a65b-b5a190004f72", "metadata": {}, "outputs": [ { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "02a229b2c09f4acf8ae4daedcb8af8ae", - "version_major": 2, - "version_minor": 1 - }, "text/plain": [ - "Map(basemap_style= 20\n", + " AND ST_Intersects(\n", + " geometry,\n", + " ST_GeomFromText($1, 4326)\n", + " )\n", + " LIMIT 5;\n", + " \"\"\",\n", + " params=(nyc_bbox_wkt,),\n", + ").to_memtable().to_view(\"buildings_nyc\")\n", + "\n", + "sd.view(\"buildings_nyc\").show(5)" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ 
-580,7 +475,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.13.8" } }, "nbformat": 4, diff --git a/docs/overture-examples.md b/docs/overture-examples.md index 0762b3b87..ad5ad8e00 100644 --- a/docs/overture-examples.md +++ b/docs/overture-examples.md @@ -21,7 +21,7 @@ > Note: Before running this notebook, ensure that you have installed SedonaDB: `pip install "apache-sedona[db]"` -This notebook demonstrates how to query and analyze the [Overture Maps](https://overturemaps.org/) dataset using SedonaDB. See [this page](https://docs.overturemaps.org/release-calendar/) to get the latest version of the Overture data. +This notebook demonstrates how to query and analyze the [Overture Maps](https://overturemaps.org/) dataset using SedonaDB. The notebook explains how to: @@ -30,353 +30,268 @@ The notebook explains how to: * Optimize subsequent query performance by caching a subset of data in memory. -```python -%pip install lonboard -``` - - Requirement already satisfied: lonboard in /opt/miniconda3/lib/python3.12/site-packages (0.12.1) - Requirement already satisfied: anywidget~=0.9.0 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (0.9.18) - Requirement already satisfied: arro3-compute>=0.4.1 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (0.6.3) - Requirement already satisfied: arro3-core>=0.4.1 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (0.6.3) - Requirement already satisfied: arro3-io>=0.4.1 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (0.6.3) - Requirement already satisfied: geoarrow-rust-core>=0.5.2 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (0.5.2) - Requirement already satisfied: ipywidgets>=7.6.0 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (8.1.7) - Requirement already satisfied: numpy>=1.14 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (2.3.3) - Requirement 
already satisfied: pyproj>=3.3 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (3.7.2) - Requirement already satisfied: traitlets>=5.7.1 in /opt/miniconda3/lib/python3.12/site-packages (from lonboard) (5.14.3) - Requirement already satisfied: psygnal>=0.8.1 in /opt/miniconda3/lib/python3.12/site-packages (from anywidget~=0.9.0->lonboard) (0.14.1) - Requirement already satisfied: typing-extensions>=4.2.0 in /opt/miniconda3/lib/python3.12/site-packages (from anywidget~=0.9.0->lonboard) (4.15.0) - Requirement already satisfied: comm>=0.1.3 in /opt/miniconda3/lib/python3.12/site-packages (from ipywidgets>=7.6.0->lonboard) (0.2.3) - Requirement already satisfied: ipython>=6.1.0 in /opt/miniconda3/lib/python3.12/site-packages (from ipywidgets>=7.6.0->lonboard) (9.5.0) - Requirement already satisfied: widgetsnbextension~=4.0.14 in /opt/miniconda3/lib/python3.12/site-packages (from ipywidgets>=7.6.0->lonboard) (4.0.14) - Requirement already satisfied: jupyterlab_widgets~=3.0.15 in /opt/miniconda3/lib/python3.12/site-packages (from ipywidgets>=7.6.0->lonboard) (3.0.15) - Requirement already satisfied: certifi in /opt/miniconda3/lib/python3.12/site-packages (from pyproj>=3.3->lonboard) (2025.8.3) - Requirement already satisfied: decorator in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (5.2.1) - Requirement already satisfied: ipython-pygments-lexers in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (1.1.1) - Requirement already satisfied: jedi>=0.16 in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.19.2) - Requirement already satisfied: matplotlib-inline in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.1.7) - Requirement already satisfied: pexpect>4.3 in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (4.9.0) - 
Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (3.0.52) - Requirement already satisfied: pygments>=2.4.0 in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (2.19.2) - Requirement already satisfied: stack_data in /opt/miniconda3/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.6.3) - Requirement already satisfied: parso<0.9.0,>=0.8.4 in /opt/miniconda3/lib/python3.12/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.8.5) - Requirement already satisfied: ptyprocess>=0.5 in /opt/miniconda3/lib/python3.12/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.7.0) - Requirement already satisfied: wcwidth in /opt/miniconda3/lib/python3.12/site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.2.14) - Requirement already satisfied: executing>=1.2.0 in /opt/miniconda3/lib/python3.12/site-packages (from stack_data->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (2.2.1) - Requirement already satisfied: asttokens>=2.1.0 in /opt/miniconda3/lib/python3.12/site-packages (from stack_data->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (3.0.0) - Requirement already satisfied: pure-eval in /opt/miniconda3/lib/python3.12/site-packages (from stack_data->ipython>=6.1.0->ipywidgets>=7.6.0->lonboard) (0.2.3) - Note: you may need to restart the kernel to use updated packages. - - - ```python import sedona.db -import os - -os.environ["AWS_SKIP_SIGNATURE"] = "true" -os.environ["AWS_DEFAULT_REGION"] = "us-west-2" sd = sedona.db.connect() ``` -## Overture buildings table +## Overture divisions +Let's load a table! Like any local or remote collection of Parquet files, we can use `sd.read_parquet()`. This is a lazy operation, fetching only metadata required to calculate a table schema. 
To reduce the number of times this needs to happen (and make the resulting DataFrame easier to reference in SQL), we use `.to_view()`. -```python -df = sd.read_parquet( - "s3://overturemaps-us-west-2/release/2025-11-19.0/theme=buildings/type=building/" -) -``` +> Overture removes old releases. See [this page](https://docs.overturemaps.org/release-calendar/#current-release) to see the latest version number and replace the relevant portion of the URL below. ```python -df.limit(10).show() +sd.read_parquet( + "s3://overturemaps-us-west-2/release/2026-02-18.0/theme=divisions/type=division_area/", + options={"aws.skip_signature": True, "aws.region": "us-west-2"}, +).to_view("divisions") ``` - ┌──────────────────────────────────────┬─────────────────────────────────────────┬───┬─────────────┐ - │ id ┆ geometry ┆ … ┆ roof_height │ - │ utf8 ┆ geometry ┆ ┆ float64 │ - ╞══════════════════════════════════════╪═════════════════════════════════════════╪═══╪═════════════╡ - │ 85b47da4-1b8d-4132-ac6c-d8dc14fab4b8 ┆ POLYGON((-6.4292972 54.8290034,-6.4291… ┆ … ┆ │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ ec12e345-d44d-4e40-8e08-e1e6e68d4d17 ┆ POLYGON((-6.430836 54.8299412,-6.43095… ┆ … ┆ │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 285f9ff9-2d6d-409c-b214-74992c8d7e7d ┆ POLYGON((-6.4311579 54.8300247,-6.4313… ┆ … ┆ │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ abedfc7c-e5fd-4a29-931e-da77b610d02d ┆ POLYGON((-6.4321833 54.8294427,-6.4322… ┆ … ┆ │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ a203a2c6-e130-4979-a7d5-8a059c6f31fd ┆ POLYGON((-6.4300627 54.829276,-6.43006… ┆ … ┆ │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 1d14caf6-b12d-486e-87dd-feef82fba9a7 ┆ 
POLYGON((-6.4301786 54.8281533,-6.4299… ┆ … ┆ │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 4b1e67cf-7355-439b-9a31-46a50f3ee227 ┆ POLYGON((-6.4298614 54.8278977,-6.4299… ┆ … ┆ │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 06de994e-efd4-4a1c-8a20-b4e883904cb2 ┆ POLYGON((-6.4296383 54.827599,-6.42956… ┆ … ┆ │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ ea0b2ea6-7c52-4395-9baa-bc023c7d3166 ┆ POLYGON((-6.4296844 54.8277379,-6.4296… ┆ … ┆ │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 49f022ef-5574-4613-ae54-af139666fde3 ┆ POLYGON((-6.4296843 54.8278169,-6.4296… ┆ … ┆ │ - └──────────────────────────────────────┴─────────────────────────────────────────┴───┴─────────────┘ - +We can preview the first few rows using `.show()`. Because this is a lazy operation and we've already cached the schema using `.to_view()`, this only takes a few seconds. 
```python -df.to_view("buildings") +sd.view("divisions").show(5) ``` + ┌───────────────┬───────────────┬──────────────┬─────────┬───┬────────┬─────────────┬──────────────┐ + │ id ┆ geometry ┆ bbox ┆ country ┆ … ┆ region ┆ admin_level ┆ division_id │ + │ utf8 ┆ geometry ┆ struct ┆ utf8 ┆ ┆ utf8 ┆ int32 ┆ utf8 │ + ╞═══════════════╪═══════════════╪══════════════╪═════════╪═══╪════════╪═════════════╪══════════════╡ + │ a5c573c4-022… ┆ POLYGON((-49… ┆ {xmin: -49.… ┆ BR ┆ … ┆ BR-PR ┆ ┆ 388a8056-ee… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ cf523f8c-c26… ┆ POLYGON((-49… ┆ {xmin: -49.… ┆ BR ┆ … ┆ BR-PR ┆ ┆ 068ef37e-3b… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 8ace3d06-b8a… ┆ POLYGON((-49… ┆ {xmin: -49.… ┆ BR ┆ … ┆ BR-PR ┆ ┆ 7238aeb3-b8… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ b26d2cba-b54… ┆ POLYGON((-49… ┆ {xmin: -49.… ┆ BR ┆ … ┆ BR-PR ┆ ┆ 3c2dc8fc-79… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 20103725-17c… ┆ POLYGON((-49… ┆ {xmin: -49.… ┆ BR ┆ … ┆ BR-PR ┆ ┆ 45037e82-de… │ + └───────────────┴───────────────┴──────────────┴─────────┴───┴────────┴─────────────┴──────────────┘ -```python -# the buildings table is large and contains billions of rows -sd.sql(""" -SELECT - COUNT(*) -FROM - buildings -""").show() -``` - - ┌────────────┐ - │ count(*) │ - │ int64 │ - ╞════════════╡ - │ 2541497985 │ - └────────────┘ +The default view of the data hides some columns to ensure the entire output can be shown. 
To look at all the columns with type details, use `.schema`: ```python -# check out the schema of the buildings table to see what it contains -df.schema +sd.view("divisions").schema ``` - SedonaSchema with 24 fields: + SedonaSchema with 14 fields: id: utf8 geometry: geometry - bbox: struct + bbox: struct + country: utf8 version: int32 - sources: list - level: int32 + sources: list subtype: utf8 class: utf8 - height: float64 - names: struct - has_parts: boolean - is_underground: boolean - num_floors: int32 - num_floors_underground: int32 - min_height: float64 - min_floor: int32 - facade_color: utf8 - facade_material: utf8 - roof_material: utf8 - roof_shape: utf8 - roof_direction: float64 - roof_orientation: utf8 - roof_color: utf8 - roof_height: float64 + names: struct + is_land: boolean + is_territorial: boolean + region: utf8 + admin_level: int32 + division_id: utf8 + +Overture data makes heavy use of nested types. These can be indexed into or expanded using SQL: ```python -# find all the buildings in New York City that are taller than 20 meters -nyc_bbox_wkt = ( - "POLYGON((-74.2591 40.4774, -74.2591 40.9176, -73.7004 40.9176, " - "-73.7004 40.4774, -74.2591 40.4774))" -) -sd.sql(f""" -SELECT - id, - height, - num_floors, - roof_shape, - ST_Centroid(geometry) as centroid -FROM - buildings -WHERE - is_underground = FALSE - AND height IS NOT NULL - AND height > 20 - AND ST_Intersects( - geometry, - ST_GeomFromText('{nyc_bbox_wkt}', 4326) - ) -LIMIT 5; -""").show() +sd.sql( + "SELECT names.primary AS name, geometry FROM divisions WHERE region = 'CA-NS'" +).show(5) ``` - ┌─────────────────────────┬────────────────────┬────────────┬────────────┬─────────────────────────┐ - │ id ┆ height ┆ num_floors ┆ roof_shape ┆ centroid │ - │ utf8 ┆ float64 ┆ int32 ┆ utf8 ┆ geometry │ - ╞═════════════════════════╪════════════════════╪════════════╪════════════╪═════════════════════════╡ - │ aa8e3a73-c72c-4f1a-b6e… ┆ 20.38205909729004 ┆ ┆ ┆ POINT(-74.187673580307… │ - 
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ efe7616b-7f7e-464c-9ce… ┆ 26.18361473083496 ┆ ┆ ┆ POINT(-74.189040982134… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ b3f734a1-325b-4e8c-b1d… ┆ 27.025876998901367 ┆ ┆ ┆ POINT(-74.2558161 40.8… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 45d88655-e2f4-4a08-926… ┆ 25.485210418701172 ┆ ┆ ┆ POINT(-74.182252194444… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 31e8353c-7d5b-4b20-94e… ┆ 21.294815063476562 ┆ ┆ ┆ POINT(-74.197113787905… │ - └─────────────────────────┴────────────────────┴────────────┴────────────┴─────────────────────────┘ + ┌────────────────────────────────────┬─────────────────────────────────────────────────────────────┐ + │ name ┆ geometry │ + │ utf8 ┆ geometry │ + ╞════════════════════════════════════╪═════════════════════════════════════════════════════════════╡ + │ Sable Island National Park Reserve ┆ POLYGON((-60.178333 43.9824655,-60.1785682 43.9825425,-60.… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Sable Island ┆ POLYGON((-59.7744732 44.2254616,-59.7928902 44.2173253,-59… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Halifax Regional Municipality ┆ MULTIPOLYGON(((-59.7321078 44.2390248,-59.7502166 44.23385… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ West Liscomb ┆ POLYGON((-62.0615594 45.0023306,-62.0621839 45.0024475,-62… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Marie Joseph ┆ POLYGON((-61.9911914 44.95646,-61.9912383 44.9579526,-61.9… │ + 
└────────────────────────────────────┴─────────────────────────────────────────────────────────────┘ -## Overture divisions table +Like all remote tables, it is worth resolving a query into a concrete local table to avoid fetching unnecessary data on repeated queries. The `.to_memtable()` method can be used to resolve a remote table into memory (great for small results); `.to_parquet()` can be used to resolve a remote table to disk (great for medium to large results). ```python -df = sd.read_parquet( - "s3://overturemaps-us-west-2/release/2025-11-19.0/theme=divisions/type=division_area/" -) -``` - +sd.sql( + "SELECT names.primary AS name, geometry FROM divisions WHERE region = 'CA-NS'" +).to_memtable().to_view("divisions_ns") -```python -# inspect a few rows of the data -df.show(10) +sd.view("divisions_ns").show(5) ``` - ┌─────────────────┬────────────────┬────────────────┬───┬────────────────┬────────┬────────────────┐ - │ id ┆ geometry ┆ bbox ┆ … ┆ is_territorial ┆ region ┆ division_id │ - │ utf8 ┆ geometry ┆ struct ┆ ┆ boolean ┆ utf8 ┆ utf8 │ - ╞═════════════════╪════════════════╪════════════════╪═══╪════════════════╪════════╪════════════════╡ - │ 3665c36d-d3a9-… ┆ POLYGON((12.5… ┆ {xmin: 12.455… ┆ … ┆ true ┆ IT-34 ┆ f05aa29f-151f… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 18a69439-a1da-… ┆ POLYGON((12.5… ┆ {xmin: 12.596… ┆ … ┆ true ┆ IT-36 ┆ ae00d58c-6e67… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 7d0f6d37-bb55-… ┆ POLYGON((12.6… ┆ {xmin: 12.567… ┆ … ┆ true ┆ IT-36 ┆ bdfc82ca-5f23… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 3f480ff6-6361-… ┆ POLYGON((12.5… ┆ {xmin: 12.549… ┆ … ┆ true ┆ IT-36 ┆ 1c750104-4470… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 31c3ab5e-eb6f-… ┆ POLYGON((12.6… ┆ {xmin: 12.612… 
┆ … ┆ true ┆ IT-34 ┆ d90804ee-19a4… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 308517e6-64b4-… ┆ POLYGON((12.5… ┆ {xmin: 12.589… ┆ … ┆ true ┆ IT-34 ┆ aabd71e9-4d98… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 646e5b1f-b76a-… ┆ POLYGON((12.5… ┆ {xmin: 12.485… ┆ … ┆ true ┆ IT-34 ┆ 502c1c4e-fc19… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ f2809a49-1082-… ┆ POLYGON((12.5… ┆ {xmin: 12.538… ┆ … ┆ true ┆ IT-34 ┆ 8b446eed-00ad… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 72b27245-c7fd-… ┆ POLYGON((12.5… ┆ {xmin: 12.501… ┆ … ┆ true ┆ IT-34 ┆ 1d535e1f-d19e… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 815855d9-05d0-… ┆ POLYGON((12.4… ┆ {xmin: 12.371… ┆ … ┆ true ┆ IT-34 ┆ 5aa91354-9e8c… │ - └─────────────────┴────────────────┴────────────────┴───┴────────────────┴────────┴────────────────┘ - + ┌────────────────────────────────────┬─────────────────────────────────────────────────────────────┐ + │ name ┆ geometry │ + │ utf8 ┆ geometry │ + ╞════════════════════════════════════╪═════════════════════════════════════════════════════════════╡ + │ Sable Island National Park Reserve ┆ POLYGON((-60.178333 43.9824655,-60.1785682 43.9825425,-60.… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Sable Island ┆ POLYGON((-59.7744732 44.2254616,-59.7928902 44.2173253,-59… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Halifax Regional Municipality ┆ MULTIPOLYGON(((-59.7321078 44.2390248,-59.7502166 44.23385… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ West Liscomb ┆ POLYGON((-62.0615594 
45.0023306,-62.0621839 45.0024475,-62… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Marie Joseph ┆ POLYGON((-61.9911914 44.95646,-61.9912383 44.9579526,-61.9… │ + └────────────────────────────────────┴─────────────────────────────────────────────────────────────┘ -```python -df.to_view("division_area") -``` +Importantly, Overture data is distributed using GeoParquet 1.1, for which SedonaDB has built-in support! This means that spatial queries (e.g., `ST_Intersects()`) tend to execute quickly against Overture. In this case, the spatial query for Nova Scotia is ~5x faster than the text-based region query. ```python -sd.sql(""" -SELECT - COUNT(*) -FROM division_area -""").show() -``` - - ┌──────────┐ - │ count(*) │ - │ int64 │ - ╞══════════╡ - │ 1052542 │ - └──────────┘ +import shapely +ns_bbox_wkb = shapely.box(-66.5, 43.4, -59.8, 47.1).wkb +sd.sql( + """ + SELECT names.primary AS name, geometry + FROM divisions + WHERE ST_Contains(ST_GeomFromWKB($wkb, 4326), geometry) + """, + params={"wkb": ns_bbox_wkb}, +).to_memtable().to_view("divisions_ns", overwrite=True) -```python -df.schema +sd.view("divisions_ns").show(5) ``` + ┌───────────────────┬──────────────────────────────────────────────────────────────────────────────┐ + │ name ┆ geometry │ + │ utf8 ┆ geometry │ + ╞═══════════════════╪══════════════════════════════════════════════════════════════════════════════╡ + │ Maces Bay ┆ POLYGON((-66.4491254 45.1265729,-66.4577261 45.126933,-66.4591563 45.126991… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Gooseberry Island ┆ POLYGON((-66.2598821 45.1380421,-66.2599962 45.1381233,-66.2600591 45.13828… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Musquash Parish ┆ POLYGON((-66.4595418 45.2215004,-66.4595406 45.221468,-66.4595396 45.221391… │ + 
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Dipper Harbour ┆ POLYGON((-66.3755086 45.118812,-66.4089711 45.1488327,-66.4284252 45.138119… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ Chance Harbour ┆ POLYGON((-66.4089711 45.1488327,-66.3755086 45.118812,-66.3541725 45.105991… │ + └───────────────────┴──────────────────────────────────────────────────────────────────────────────┘ +## Overture buildings table - SedonaSchema with 13 fields: - id: utf8 - geometry: geometry - bbox: struct - country: utf8 - version: int32 - sources: list - subtype: utf8 - class: utf8 - names: struct - is_land: boolean - is_territorial: boolean - region: utf8 - division_id: utf8 +The [Overture buildings table](https://docs.overturemaps.org/guides/buildings/) is one of the largest tables provided by the Overture Maps Foundation. The workflow is similar to the division table or any remote table; however, the buildings table presents several unique challenges. +First, the metadata size for all files in the buildings table is very large. SedonaDB caches remote metadata to avoid repeated download; however, the default cache size is too small. For repeated queries against the buildings table, ensure that the cache size is increased to at least 900 MB and/or `.to_view()` is used to cache the schema. The cache lives as long as the session; to clear it, use `sd = sedona.db.connect()` to start a new session or reset the cache size to a smaller value. +> Overture removes old releases. See [this page](https://docs.overturemaps.org/release-calendar/#current-release) to see the latest version number and replace the relevant portion of the URL below. 
```python -# get all the divisions in Nova Scotia and save them in memory with to_memtable() -nova_scotia_bbox_wkt = ( - "POLYGON((-66.5 43.4, -66.5 47.1, -59.8 47.1, -59.8 43.4, -66.5 43.4))" -) -ns = sd.sql(f""" -SELECT - country, region, names, geometry -FROM division_area -WHERE - ST_Intersects( - geometry, - ST_GeomFromText('{nova_scotia_bbox_wkt}', 4326) - ) -""").to_memtable() +sd.sql("SET datafusion.runtime.metadata_cache_limit = '900M'").execute() + +sd.read_parquet( + "s3://overturemaps-us-west-2/release/2026-02-18.0/theme=buildings/type=building/", + options={"aws.skip_signature": True, "aws.region": "us-west-2"}, +).to_view("buildings") ``` +Like all SedonaDB DataFrames, viewing a schema or previewing the first few rows are lazy and usually fast unless a query contains large aggregations or joins. + ```python -ns.to_view("ns_divisions") +sd.view("buildings").show(5) ``` + ┌──────────────────────────────────────┬─────────────────────────────────────────┬───┬─────────────┐ + │ id ┆ geometry ┆ … ┆ roof_height │ + │ utf8 ┆ geometry ┆ ┆ float64 │ + ╞══════════════════════════════════════╪═════════════════════════════════════════╪═══╪═════════════╡ + │ ab23f7ee-4c05-4246-a016-8260ce58a916 ┆ POLYGON((-67.589523 -39.0908362,-67.58… ┆ … ┆ │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 58356258-2e80-48fc-aacf-d81fcf74074c ┆ POLYGON((-67.5896327 -39.0907868,-67.5… ┆ … ┆ │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ b50595a8-cddb-44dd-bdbf-7bbe1e858ae0 ┆ POLYGON((-67.5897117 -39.0908483,-67.5… ┆ … ┆ │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ cbabe2df-f49a-4e9f-9cbe-c527a4b3b9f1 ┆ POLYGON((-67.5898768 -39.0907073,-67.5… ┆ … ┆ │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ bcd6984b-8da4-4dfe-9212-be2b02a24b67 ┆ 
POLYGON((-67.5901879 -39.0908288,-67.5… ┆ … ┆ │ + └──────────────────────────────────────┴─────────────────────────────────────────┴───┴─────────────┘ + -```python -df = sd.sql(""" -SELECT UNNEST(names), geometry -FROM ns_divisions -WHERE region = 'CA-NS' -""") -``` +Some operations like `.count()` use summary statistics and execute quickly even for large remote tables: ```python -%%time -# this executes quickly because the Nova Scotia data was persisted in memory with `to_memtable()` -df.show(2) +sd.view("buildings").count() ``` - ┌────────────────────────┬────────────────────────┬────────────────────────┬───────────────────────┐ - │ __unnest_placeholder(n ┆ __unnest_placeholder(n ┆ __unnest_placeholder(n ┆ geometry │ - │ s_divisions.names).pr… ┆ s_divisions.names).co… ┆ s_divisions.names).ru… ┆ geometry │ - ╞════════════════════════╪════════════════════════╪════════════════════════╪═══════════════════════╡ - │ Apple River ┆ ┆ ┆ POLYGON((-64.7260681… │ - ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ Allen Hill ┆ ┆ ┆ POLYGON((-64.6956656… │ - └────────────────────────┴────────────────────────┴────────────────────────┴───────────────────────┘ - CPU times: user 1.25 ms, sys: 805 μs, total: 2.05 ms - Wall time: 1.42 ms - -## Visualize the results with lonboard -```python -import lonboard + 2541282557 -lonboard.viz(df) -``` +Overture buildings has a number of attributes on which we can filter. For long-running queries it may be convenient to cache a result locally using `.to_memtable()` or `.to_parquet()` before inspecting using other tools; however, like all Overture tables, it is optimized for spatial queries and these are usually not expensive for small areas. 
+For example, we can find all of the buildings in New York City taller than 20 meters: - Map(basemap_style= 20 + AND ST_Intersects( + geometry, + ST_GeomFromText($1, 4326) + ) + LIMIT 5; + """, + params=(nyc_bbox_wkt,), +).to_memtable().to_view("buildings_nyc") + +sd.view("buildings_nyc").show(5) +``` -![Lonboard NS](image/lonboard_ns.png) + ┌─────────────────────────┬────────────────────┬────────────┬────────────┬─────────────────────────┐ + │ id ┆ height ┆ num_floors ┆ roof_shape ┆ centroid │ + │ utf8 ┆ float64 ┆ int32 ┆ utf8 ┆ geometry │ + ╞═════════════════════════╪════════════════════╪════════════╪════════════╪═════════════════════════╡ + │ aa8e3a73-c72c-4f1a-b6e… ┆ 20.38205909729004 ┆ ┆ ┆ POINT(-74.187673580307… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ efe7616b-7f7e-464c-9ce… ┆ 26.18361473083496 ┆ ┆ ┆ POINT(-74.189040982134… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ b3f734a1-325b-4e8c-b1d… ┆ 27.025876998901367 ┆ ┆ ┆ POINT(-74.2558161 40.8… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 45d88655-e2f4-4a08-926… ┆ 25.485210418701172 ┆ ┆ ┆ POINT(-74.182252194444… │ + ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 31e8353c-7d5b-4b20-94e… ┆ 21.294815063476562 ┆ ┆ ┆ POINT(-74.197113787905… │ + └─────────────────────────┴────────────────────┴────────────┴────────────┴─────────────────────────┘ diff --git a/python/sedonadb/tests/test_udf.py b/python/sedonadb/tests/test_udf.py index 4159c96ff..cead6de1a 100644 --- a/python/sedonadb/tests/test_udf.py +++ b/python/sedonadb/tests/test_udf.py @@ -237,42 +237,3 @@ def __datafusion_scalar_udf__(self): con.sql("SELECT some_external_udf('abcd', 123) as col").to_pandas(), pd.DataFrame({"col": [b"abcd / 123"]}), ) - - -def test_udf_sedonadb_registry_function_to_datafusion(con): 
- datafusion = pytest.importorskip("datafusion") - udf_impl = udf.arrow_udf(pa.binary(), [udf.STRING, udf.NUMERIC])(some_udf) - - # Register with our session - con.register_udf(udf_impl) - - # Create a datafusion session, fetch our udf and register with the other session - datafusion_ctx = datafusion.SessionContext() - datafusion_ctx.register_udf( - datafusion.ScalarUDF.from_pycapsule(con._impl.scalar_udf("some_udf")) - ) - - # Can't quite use to_pandas() because there is a schema/batch nullability mismatch - batches = datafusion_ctx.sql("SELECT some_udf('abcd', 123) as col").collect() - assert len(batches) == 1 - pd.testing.assert_frame_equal( - batches[0].to_pandas(), - pd.DataFrame({"col": [b"abcd / 123"]}), - ) - - -def test_udf_sedonadb_to_datafusion(): - datafusion = pytest.importorskip("datafusion") - udf_impl = udf.arrow_udf(pa.binary(), [udf.STRING, udf.NUMERIC])(some_udf) - - # Create a datafusion session, register udf_impl directly - datafusion_ctx = datafusion.SessionContext() - datafusion_ctx.register_udf(datafusion.ScalarUDF.from_pycapsule(udf_impl)) - - # Can't quite use to_pandas() because there is a schema/batch nullability mismatch - batches = datafusion_ctx.sql("SELECT some_udf('abcd', 123) as col").collect() - assert len(batches) == 1 - pd.testing.assert_frame_equal( - batches[0].to_pandas(), - pd.DataFrame({"col": [b"abcd / 123"]}), - ) diff --git a/rust/sedona-geoparquet/src/file_opener.rs b/rust/sedona-geoparquet/src/file_opener.rs index 6ba04b823..c827f7766 100644 --- a/rust/sedona-geoparquet/src/file_opener.rs +++ b/rust/sedona-geoparquet/src/file_opener.rs @@ -27,6 +27,7 @@ use datafusion_common::{ exec_err, Result, }; use datafusion_datasource_parquet::metadata::DFParquetMetadata; +use datafusion_execution::cache::cache_manager::FileMetadataCache; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::metrics::{ ExecutionPlanMetricsSet, MetricBuilder, MetricType, MetricValue, PruningMetrics, @@ -111,6 +112,7 @@ 
pub(crate) struct GeoParquetFileOpener { pub enable_pruning: bool, pub metrics: GeoParquetFileOpenerMetrics, pub options: TableGeoParquetOptions, + pub metadata_cache: Option>, } impl FileOpener for GeoParquetFileOpener { @@ -121,6 +123,7 @@ impl FileOpener for GeoParquetFileOpener { let parquet_metadata = DFParquetMetadata::new(&self_clone.object_store, &file.object_meta) .with_metadata_size_hint(self_clone.metadata_size_hint) + .with_file_metadata_cache(self_clone.metadata_cache) .fetch_metadata() .await?; diff --git a/rust/sedona-geoparquet/src/format.rs b/rust/sedona-geoparquet/src/format.rs index 08a04c92c..6c91ccfca 100644 --- a/rust/sedona-geoparquet/src/format.rs +++ b/rust/sedona-geoparquet/src/format.rs @@ -40,6 +40,7 @@ use datafusion::{ use datafusion_catalog::{memory::DataSourceExec, Session}; use datafusion_common::{plan_err, GetExt, Result, Statistics}; use datafusion_datasource_parquet::metadata::DFParquetMetadata; +use datafusion_execution::cache::cache_manager::FileMetadataCache; use datafusion_physical_expr::{LexRequirement, PhysicalExpr}; use datafusion_physical_plan::{ filter_pushdown::FilterPushdownPropagation, metrics::ExecutionPlanMetricsSet, ExecutionPlan, @@ -47,7 +48,7 @@ use datafusion_physical_plan::{ use futures::{StreamExt, TryStreamExt}; use object_store::{ObjectMeta, ObjectStore}; -use sedona_common::sedona_internal_err; +use sedona_common::{sedona_internal_datafusion_err, sedona_internal_err}; use sedona_schema::extension_type::ExtensionType; @@ -197,16 +198,22 @@ impl FileFormat for GeoParquetFormat { let inner_schema_without_metadata = self.inner().infer_schema(state, store, objects).await?; + let file_metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache(); + // Collect metadata separately. We can in theory do our own schema // inference too to save an extra server request, but then we have to // copy more ParquetFormat code. It may be that caching at the object // store level is the way to go here. 
let metadatas: Vec<_> = futures::stream::iter(objects) - .map(|object| async move { - DFParquetMetadata::new(store.as_ref(), object) - .with_metadata_size_hint(self.inner().metadata_size_hint()) - .fetch_metadata() - .await + .map(|object| { + let metadata_cache = file_metadata_cache.clone(); + async move { + DFParquetMetadata::new(store.as_ref(), object) + .with_metadata_size_hint(self.inner().metadata_size_hint()) + .with_file_metadata_cache(Some(metadata_cache)) + .fetch_metadata() + .await + } }) .boxed() // Workaround https://github.com/rust-lang/rust/issues/64552 .buffered(state.config_options().execution.meta_fetch_concurrency) @@ -305,7 +312,7 @@ impl FileFormat for GeoParquetFormat { async fn create_physical_plan( &self, - _state: &dyn Session, + state: &dyn Session, config: FileScanConfig, ) -> Result> { // A copy of ParquetSource::create_physical_plan() that ensures the underlying @@ -316,12 +323,21 @@ impl FileFormat for GeoParquetFormat { metadata_size_hint = Some(metadata); } - let mut source = GeoParquetFileSource::new(self.options.clone()); + let mut source = config + .file_source() + .as_any() + .downcast_ref::() + .cloned() + .ok_or_else(|| sedona_internal_datafusion_err!("Expected GeoParquetFileSource"))?; + + source = source.with_options(self.options.clone()); if let Some(metadata_size_hint) = metadata_size_hint { source = source.with_metadata_size_hint(metadata_size_hint) } + let file_metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache(); + source.metadata_cache = Some(file_metadata_cache.clone()); let conf = FileScanConfigBuilder::from(config) .with_source(Arc::new(source)) .build(); @@ -371,6 +387,7 @@ pub struct GeoParquetFileSource { metadata_size_hint: Option, predicate: Option>, options: TableGeoParquetOptions, + metadata_cache: Option>, } impl GeoParquetFileSource { @@ -381,6 +398,14 @@ impl GeoParquetFileSource { metadata_size_hint: None, predicate: None, options, + metadata_cache: None, + } + } + + pub fn 
with_options(&self, options: TableGeoParquetOptions) -> Self { + Self { + options, + ..self.clone() } } @@ -431,6 +456,7 @@ impl GeoParquetFileSource { options: TableGeoParquetOptions::from( parquet_source.table_parquet_options().clone(), ), + metadata_cache: None, }) } else { sedona_internal_err!("GeoParquetFileSource constructed from non-ParquetSource") @@ -444,6 +470,7 @@ impl GeoParquetFileSource { metadata_size_hint: self.metadata_size_hint, predicate: Some(predicate), options: self.options.clone(), + metadata_cache: self.metadata_cache.clone(), } } @@ -469,6 +496,7 @@ impl GeoParquetFileSource { metadata_size_hint: self.metadata_size_hint, predicate: self.predicate.clone(), options: self.options.clone(), + metadata_cache: self.metadata_cache.clone(), } } @@ -479,6 +507,7 @@ impl GeoParquetFileSource { metadata_size_hint: Some(hint), predicate: self.predicate.clone(), options: self.options.clone(), + metadata_cache: self.metadata_cache.clone(), } } } @@ -504,11 +533,12 @@ impl FileSource for GeoParquetFileSource { metadata_size_hint: self.metadata_size_hint, predicate: self.predicate.clone(), file_schema: base_config.file_schema().clone(), - enable_pruning: self.inner.table_parquet_options().global.pruning, + enable_pruning: self.options.inner.global.pruning, // HACK: Since there is no public API to set inner's metrics, so we use // inner's metrics as the ExecutionPlan-global metrics metrics: GeoParquetFileOpenerMetrics::new(self.inner.metrics()), options: self.options.clone(), + metadata_cache: self.metadata_cache.clone(), }) } @@ -527,6 +557,7 @@ impl FileSource for GeoParquetFileSource { None, )?; updated_inner.options = self.options.clone(); + updated_inner.metadata_cache = self.metadata_cache.clone(); Ok(inner_result.with_updated_node(Arc::new(updated_inner))) } None => Ok(inner_result), @@ -544,6 +575,7 @@ impl FileSource for GeoParquetFileSource { self.predicate.clone(), ); source.options = self.options.clone(); + source.metadata_cache = 
self.metadata_cache.clone(); Arc::new(source) } @@ -554,6 +586,7 @@ impl FileSource for GeoParquetFileSource { self.predicate.clone(), ); source.options = self.options.clone(); + source.metadata_cache = self.metadata_cache.clone(); Arc::new(source) } @@ -564,6 +597,7 @@ impl FileSource for GeoParquetFileSource { self.predicate.clone(), ); source.options = self.options.clone(); + source.metadata_cache = self.metadata_cache.clone(); Arc::new(source) } @@ -574,6 +608,7 @@ impl FileSource for GeoParquetFileSource { self.predicate.clone(), ); source.options = self.options.clone(); + source.metadata_cache = self.metadata_cache.clone(); Arc::new(source) }