diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..6d6633b16 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,77 @@ + + +# Running Benchmarks + +## pytest-benchmark + +These benchmarks provide a convenient way to compare the results of running queries on sedona-db to other engines like DuckDB and PostGIS. + +### Setup + +Install pytest-benchmark: +```bash +pip install pytest-benchmark +``` + +### Running benchmarks + +The commands below assume your working directory is `benchmarks`. + +```bash +cd benchmarks/ +``` + +To run a benchmark, simply run the corresponding test function. For example, to run the benchmarks for st_buffer, you can run: + +```bash +pytest test_functions.py::TestBenchFunctions::test_st_buffer +``` + +Most of the time, you'll also want to group by `param:table` or `func` (function) by using the `--benchmark-group-by=param:table` flag. pytest-benchmark will highlight the "best" value in green (e.g., fastest for median, lowest for stddev) and the "worst" value in red for each column within each group. + +```bash +pytest --benchmark-group-by=param:table test_functions.py::TestBenchFunctions::test_st_buffer +``` + +You can also reduce the number of columns displayed by using the `--benchmark-columns` flag. 
+ +```bash +pytest --benchmark-group-by=param:table --benchmark-columns=median,mean,stddev test_functions.py::TestBenchFunctions::test_st_buffer +``` + +Example output of the last command: + +``` +----------------------------- benchmark 'table=collections_complex': 3 tests ----------------------------- +Name (time in ms) Median Mean StdDev +---------------------------------------------------------------------------------------------------------- +test_st_buffer[collections_complex-SedonaDB] 87.0095 (1.0) 87.7874 (1.0) 3.7269 (1.0) +test_st_buffer[collections_complex-DuckDB] 440.4810 (5.06) 444.6948 (5.07) 12.1143 (3.25) +test_st_buffer[collections_complex-PostGIS] 864.5841 (9.94) 883.3661 (10.06) 50.4996 (13.55) +---------------------------------------------------------------------------------------------------------- + +---------------------------- benchmark 'table=collections_simple': 3 tests ----------------------------- +Name (time in ms) Median Mean StdDev +-------------------------------------------------------------------------------------------------------- +test_st_buffer[collections_simple-SedonaDB] 85.8510 (1.0) 86.5050 (1.0) 3.8481 (1.0) +test_st_buffer[collections_simple-DuckDB] 442.6664 (5.16) 444.5187 (5.14) 5.6186 (1.46) +test_st_buffer[collections_simple-PostGIS] 855.3329 (9.96) 854.7194 (9.88) 7.6190 (1.98) +-------------------------------------------------------------------------------------------------------- +``` + +For more details and command line options, refer to the official [pytest-benchmark documentation](https://pytest-benchmark.readthedocs.io/en/latest/usage.html) diff --git a/benchmarks/test_bench_base.py b/benchmarks/test_bench_base.py new file mode 100644 index 000000000..8e62f35c7 --- /dev/null +++ b/benchmarks/test_bench_base.py @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
class TestBenchBase:
    """Common base class for the engine-comparison benchmark suites.

    ``setup_class`` creates one handle per engine (SedonaDB, PostGIS and
    DuckDB), generates a set of synthetic geometry tables with SedonaDB's
    ``sd_random_geometry`` table function and loads the same collected
    result into every engine, so each benchmark queries identical data.

    Every generated table exposes three columns: ``geom1`` and ``geom2``
    (the same generated ``geometry`` column selected twice) and
    ``integer`` (``round(random() * 100)``).
    """

    # Value passed as ``target_rows`` for every synthetic table.
    _NUM_GEOMS = 100_000

    # One entry per table:
    # (table name, geom_type, vertices_per_linestring_range).
    _TABLE_SPECS = (
        ("segments_large", "LineString", [2, 2]),
        ("polygons_simple", "Polygon", [10, 10]),
        ("polygons_complex", "Polygon", [500, 500]),
        ("collections_simple", "GeometryCollection", [10, 10]),
        ("collections_complex", "GeometryCollection", [500, 500]),
    )

    @classmethod
    def setup_class(cls):
        """Create the engine handles and load the synthetic tables.

        pytest calls this xunit-style hook with the test class, so the
        engines are stored as class attributes (``sedonadb``, ``postgis``,
        ``duckdb``) shared by all tests of the class.

        NOTE(review): ``create_or_skip`` presumably skips the tests when an
        engine is unavailable -- confirm against ``sedonadb.testing``.
        """
        cls.sedonadb = SedonaDB.create_or_skip()
        cls.postgis = PostGIS.create_or_skip()
        cls.duckdb = DuckDB.create_or_skip()

        for name, geom_type, vertex_range in cls._TABLE_SPECS:
            # Key order is kept stable so the JSON handed to
            # sd_random_geometry is the same for every run.
            options = {
                "geom_type": geom_type,
                "target_rows": cls._NUM_GEOMS,
                "vertices_per_linestring_range": vertex_range,
            }

            # Generate synthetic data once, in SedonaDB ...
            query = f"""
                SELECT
                    geometry as geom1,
                    geometry as geom2,
                    round(random() * 100) as integer
                FROM sd_random_geometry('{json.dumps(options)}')
            """
            tab = cls.sedonadb.execute_and_collect(query)

            # ... then register the very same result under the same name
            # in every engine so the comparison is like-for-like.
            cls.sedonadb.create_table_arrow(name, tab)
            cls.postgis.create_table_arrow(name, tab)
            cls.duckdb.create_table_arrow(name, tab)

    def _get_eng(self, eng):
        """Return the engine handle created in ``setup_class`` for ``eng``.

        Args:
            eng: One of the engine classes ``SedonaDB``, ``PostGIS`` or
                ``DuckDB`` (the classes themselves, as used in the
                ``parametrize`` lists of the benchmark modules).

        Returns:
            The matching handle stored by ``setup_class``.

        Raises:
            ValueError: If ``eng`` is none of the three supported engines.
        """
        if eng == SedonaDB:
            return self.sedonadb
        if eng == PostGIS:
            return self.postgis
        if eng == DuckDB:
            return self.duckdb
        raise ValueError(f"Unsupported engine: {eng}")
class TestBenchPredicates(TestBenchBase):
    """Benchmarks for distance computations between two geometry columns."""

    # NOTE(review): test_overlay.py and test_predicates.py declare a class
    # with this same name; a file-specific name may be clearer.

    # Engine classes every benchmark is run against.
    _ENGINES = [SedonaDB, PostGIS, DuckDB]
    # Tables created by TestBenchBase.setup_class that are queried here.
    _TABLES = ["polygons_simple", "polygons_complex"]

    @pytest.mark.parametrize("eng", _ENGINES)
    @pytest.mark.parametrize("table", _TABLES)
    def test_st_distance(self, benchmark, eng, table):
        """Time ``ST_Distance(geom1, geom2)`` selected from ``table``."""
        engine = self._get_eng(eng)

        def run_query():
            engine.execute_and_collect(
                f"SELECT ST_Distance(geom1, geom2) from {table}"
            )

        benchmark(run_query)
class TestBenchFunctions(TestBenchBase):
    """Benchmarks for single-geometry functions, run once per engine/table.

    Each test resolves the parametrized engine class to the handle created
    by ``TestBenchBase.setup_class`` and hands a closure that executes one
    SQL query to the ``benchmark`` fixture.
    """

    # Engine classes every benchmark is run against.
    _ENGINES = [SedonaDB, PostGIS, DuckDB]
    # Table groups created by TestBenchBase.setup_class.
    _POLYGON_TABLES = ["polygons_simple", "polygons_complex"]
    _COLLECTION_TABLES = ["collections_simple", "collections_complex"]

    @pytest.mark.parametrize("eng", _ENGINES)
    @pytest.mark.parametrize("table", _POLYGON_TABLES)
    def test_st_area(self, benchmark, eng, table):
        """Time ``ST_Area(geom1)`` selected from ``table``."""
        engine = self._get_eng(eng)

        def run_query():
            engine.execute_and_collect(f"SELECT ST_Area(geom1) from {table}")

        benchmark(run_query)

    @pytest.mark.parametrize("eng", _ENGINES)
    @pytest.mark.parametrize("table", _COLLECTION_TABLES)
    def test_st_buffer(self, benchmark, eng, table):
        """Time ``ST_Buffer(geom1, 2.0)`` selected from ``table``."""
        engine = self._get_eng(eng)

        def run_query():
            engine.execute_and_collect(f"SELECT ST_Buffer(geom1, 2.0) from {table}")

        benchmark(run_query)

    @pytest.mark.parametrize("eng", _ENGINES)
    @pytest.mark.parametrize("table", _POLYGON_TABLES)
    def test_st_centroid(self, benchmark, eng, table):
        """Time ``ST_Centroid(geom1)`` selected from ``table``."""
        engine = self._get_eng(eng)

        def run_query():
            engine.execute_and_collect(f"SELECT ST_Centroid(geom1) from {table}")

        benchmark(run_query)

    @pytest.mark.parametrize("eng", _ENGINES)
    @pytest.mark.parametrize("table", _COLLECTION_TABLES)
    def test_st_dimension(self, benchmark, eng, table):
        """Time ``ST_Dimension(geom1)`` selected from ``table``."""
        engine = self._get_eng(eng)

        def run_query():
            engine.execute_and_collect(f"SELECT ST_Dimension(geom1) from {table}")

        benchmark(run_query)

    @pytest.mark.parametrize("eng", _ENGINES)
    @pytest.mark.parametrize("table", _COLLECTION_TABLES)
    def test_st_envelope(self, benchmark, eng, table):
        """Time ``ST_Envelope(geom1)`` selected from ``table``."""
        engine = self._get_eng(eng)

        def run_query():
            engine.execute_and_collect(f"SELECT ST_Envelope(geom1) from {table}")

        benchmark(run_query)

    @pytest.mark.parametrize("eng", _ENGINES)
    @pytest.mark.parametrize("table", _COLLECTION_TABLES)
    def test_st_geometrytype(self, benchmark, eng, table):
        """Time ``ST_GeometryType(geom1)`` selected from ``table``."""
        engine = self._get_eng(eng)

        def run_query():
            engine.execute_and_collect(f"SELECT ST_GeometryType(geom1) from {table}")

        benchmark(run_query)
class TestBenchPredicates(TestBenchBase):
    """Benchmarks for overlay operations between two geometry columns."""

    # NOTE(review): test_distance.py and test_predicates.py declare a class
    # with this same name; a file-specific name may be clearer.

    # Engine classes every benchmark is run against.
    _ENGINES = [SedonaDB, PostGIS, DuckDB]
    # Tables created by TestBenchBase.setup_class that are queried here.
    _TABLES = ["polygons_simple", "polygons_complex"]

    @pytest.mark.parametrize("eng", _ENGINES)
    @pytest.mark.parametrize("table", _TABLES)
    def test_st_difference(self, benchmark, eng, table):
        """Time ``ST_Difference(geom1, geom2)`` selected from ``table``."""
        engine = self._get_eng(eng)

        def run_query():
            engine.execute_and_collect(
                f"SELECT ST_Difference(geom1, geom2) from {table}"
            )

        benchmark(run_query)
class TestBenchPredicates(TestBenchBase):
    """Benchmarks for binary spatial predicates on two geometry columns."""

    # Engine classes every benchmark is run against.
    _ENGINES = [SedonaDB, PostGIS, DuckDB]
    # Tables created by TestBenchBase.setup_class that are queried here.
    _TABLES = ["polygons_simple", "polygons_complex"]

    @pytest.mark.parametrize("eng", _ENGINES)
    @pytest.mark.parametrize("table", _TABLES)
    def test_st_contains(self, benchmark, eng, table):
        """Time ``ST_Contains(geom1, geom2)`` selected from ``table``."""
        engine = self._get_eng(eng)

        def run_query():
            engine.execute_and_collect(
                f"SELECT ST_Contains(geom1, geom2) from {table}"
            )

        benchmark(run_query)

    @pytest.mark.parametrize("eng", _ENGINES)
    @pytest.mark.parametrize("table", _TABLES)
    def test_st_dwithin(self, benchmark, eng, table):
        """Time ``ST_DWithin(geom1, geom2, 1.0)`` selected from ``table``."""
        engine = self._get_eng(eng)

        def run_query():
            engine.execute_and_collect(
                f"SELECT ST_DWithin(geom1, geom2, 1.0) from {table}"
            )

        benchmark(run_query)