From 3855e71824dcc49ae386c18192c9ec9c5a6ed17f Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Dec 2025 23:26:16 +0000 Subject: [PATCH 1/5] fix: Use reset_val for lock_timeout in settings metric pgwatch sets lock_timeout to 100ms during metric collection, which causes the monitoring to report 100ms instead of the actual configured value. This fix queries reset_val from pg_settings for lock_timeout to get the genuine configuration value. Closes #61 --- config/pgwatch-prometheus/metrics.yml | 6 +- tests/settings/__init__.py | 0 tests/settings/test_settings_metric.py | 168 +++++++++++++++++++++++++ 3 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 tests/settings/__init__.py create mode 100644 tests/settings/test_settings_metric.py diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index 4bc45fd..66ec2b7 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -362,17 +362,19 @@ metrics: This metric collects various PostgreSQL server settings and configurations. It provides insights into the server's configuration, including version, memory settings, and other important parameters. This metric is useful for monitoring server settings and ensuring optimal performance. + Note: For lock_timeout, we use reset_val instead of setting because pgwatch sets + lock_timeout to 100ms during metric collection, which would mask the actual configured value. sqls: 11: |- select /* pgwatch_generated */ (extract(epoch from now()) * 1e9)::int8 as epoch_ns, current_database() as tag_datname, name as tag_setting_name, - setting as tag_setting_value, + case when name = 'lock_timeout' then reset_val else setting end as tag_setting_value, unit as tag_unit, category as tag_category, vartype as tag_vartype, - case when setting ~ '^-?[0-9]+$' then setting::bigint else null end as numeric_value, + case when (case when name = 'lock_timeout' then reset_val else setting end) ~ '^-?[0-9]+$' then (case when name = 'lock_timeout' then reset_val else setting end)::bigint else null end as numeric_value, case when source <> 'default' then 0 else 1 end as is_default, 1 as configured from pg_settings diff --git a/tests/settings/__init__.py b/tests/settings/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/settings/test_settings_metric.py b/tests/settings/test_settings_metric.py new file mode 100644 index 0000000..df16126 --- /dev/null +++ b/tests/settings/test_settings_metric.py @@ -0,0 +1,168 @@ +""" +Tests for the settings metric, particularly the lock_timeout fix. + +This module tests that the settings metric correctly uses reset_val for lock_timeout +instead of the session-level setting value, which would be affected by pgwatch's +lock_timeout override during metric collection. + +See: https://gitlab.com/postgres-ai/postgres_ai/-/issues/61 +""" + +import os +from pathlib import Path + +import pytest +import yaml + + +def get_metrics_yml_path() -> Path: + """Get the path to the metrics.yml file.""" + repo_root = Path(__file__).parent.parent.parent + return repo_root / "config" / "pgwatch-prometheus" / "metrics.yml" + + +@pytest.fixture(name="metrics_config") +def fixture_metrics_config() -> dict: + """Load the metrics.yml configuration.""" + metrics_path = get_metrics_yml_path() + with open(metrics_path, "r", encoding="utf-8") as f: + return yaml.safe_load(f) + + +@pytest.mark.unit +def test_settings_metric_exists(metrics_config: dict) -> None: + """Test that the settings metric is defined in metrics.yml.""" + assert "metrics" in metrics_config, "metrics key should exist" + assert "settings" in metrics_config["metrics"], "settings metric should be defined" + assert "sqls" in metrics_config["metrics"]["settings"], "settings metric should have sqls" + assert 11 in metrics_config["metrics"]["settings"]["sqls"], "settings metric should have SQL for version 11+" + + +@pytest.mark.unit +def test_settings_metric_uses_reset_val_for_lock_timeout(metrics_config: dict) -> None: + """ + Test that the settings metric uses reset_val for lock_timeout. + + This is critical because pgwatch sets lock_timeout to 100ms during metric + collection, which would mask the actual configured value if we used 'setting'. + + See: https://gitlab.com/postgres-ai/postgres_ai/-/issues/61 + """ + sql = metrics_config["metrics"]["settings"]["sqls"][11] + + # Verify the SQL uses reset_val for lock_timeout + assert "reset_val" in sql, "SQL should reference reset_val column" + assert "lock_timeout" in sql, "SQL should reference lock_timeout" + + # More specific check: ensure the CASE statement is used correctly + assert "case when name = 'lock_timeout' then reset_val else setting end" in sql, \ + "SQL should use CASE statement to select reset_val for lock_timeout" + + +@pytest.mark.unit +def test_settings_metric_description_documents_lock_timeout_fix(metrics_config: dict) -> None: + """Test that the description documents the lock_timeout fix.""" + description = metrics_config["metrics"]["settings"]["description"] + assert "lock_timeout" in description.lower(), \ + "Description should mention lock_timeout" + assert "reset_val" in description.lower(), \ + "Description should mention reset_val" + + +@pytest.mark.unit +def test_settings_metric_has_required_fields(metrics_config: dict) -> None: + """Test that the settings metric SQL includes all required fields.""" + sql = metrics_config["metrics"]["settings"]["sqls"][11] + + required_fields = [ + "epoch_ns", + "tag_datname", + "tag_setting_name", + "tag_setting_value", + "tag_unit", + "tag_category", + "tag_vartype", + "numeric_value", + "is_default", + "configured", + ] + + for field in required_fields: + assert field in sql, f"SQL should include {field}" + + +@pytest.mark.unit +def test_settings_metric_numeric_value_uses_correct_source_for_lock_timeout(metrics_config: dict) -> None: + """ + Test that numeric_value also uses reset_val for lock_timeout. + + The numeric_value field should be derived from the same source as tag_setting_value + to ensure consistency. + """ + sql = metrics_config["metrics"]["settings"]["sqls"][11] + + # Count occurrences of the CASE expression for lock_timeout + # It should appear twice: once for tag_setting_value and once for numeric_value + case_expr = "case when name = 'lock_timeout' then reset_val else setting end" + occurrences = sql.count(case_expr) + + assert occurrences >= 2, \ + f"CASE expression for lock_timeout should appear at least twice (for tag_setting_value and numeric_value), found {occurrences}" + + +@pytest.mark.integration +@pytest.mark.requires_postgres +def test_settings_metric_lock_timeout_returns_actual_value() -> None: + """ + Integration test: verify lock_timeout returns the actual configured value. + + This test requires a running PostgreSQL database and tests that even when + lock_timeout is set to a different value in the session, our query returns + the actual configured (reset) value. + """ + try: + import psycopg + except ImportError: + pytest.skip("psycopg not available") + + target_db_url = os.getenv( + "TARGET_DB_URL", + "postgresql://postgres:postgres@localhost:55432/target_database" + ) + + try: + conn = psycopg.connect(target_db_url) + except Exception as e: + pytest.skip(f"Could not connect to PostgreSQL: {e}") + + try: + with conn.cursor() as cur: + # First, get the actual configured lock_timeout + cur.execute("SELECT reset_val FROM pg_settings WHERE name = 'lock_timeout'") + actual_value = cur.fetchone()[0] + + # Now, simulate what pgwatch does - set a session-level lock_timeout + cur.execute("SET lock_timeout = '100ms'") + + # Query using our fixed SQL logic + cur.execute(""" + SELECT + name, + setting as raw_setting, + reset_val, + CASE WHEN name = 'lock_timeout' THEN reset_val ELSE setting END as our_value + FROM pg_settings + WHERE name = 'lock_timeout' + """) + row = cur.fetchone() + name, raw_setting, reset_val, our_value = row + + # Verify: raw_setting should be 100 (the session override) + # but our_value should be the actual configured value + assert raw_setting == "100", f"Session setting should be 100ms, got {raw_setting}" + assert our_value == actual_value, \ + f"Our query should return reset_val ({actual_value}), not session value ({raw_setting})" + assert our_value == reset_val, "our_value should equal reset_val" + + finally: + conn.close() From 78930640d86e1c74dd4897bf0fb829f63ef7cccf Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Dec 2025 23:41:27 +0000 Subject: [PATCH 2/5] docs: Add SQL comment explaining reset_val usage for lock_timeout --- config/pgwatch-prometheus/metrics.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index 66ec2b7..8657fba 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -370,6 +370,8 @@ metrics: (extract(epoch from now()) * 1e9)::int8 as epoch_ns, current_database() as tag_datname, name as tag_setting_name, + -- Use reset_val for lock_timeout because pgwatch sets it to 100ms during collection, + -- which would mask the actual configured value. reset_val shows the true server config. case when name = 'lock_timeout' then reset_val else setting end as tag_setting_value, unit as tag_unit, category as tag_category, From c7c2b278cf09a200d86625988e29dbe5c0f07141 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Dec 2025 05:32:16 +0000 Subject: [PATCH 3/5] fix: Use boot_val for is_default check on lock_timeout When pgwatch sets lock_timeout during collection, source becomes 'session', causing is_default to incorrectly report non-default. Now we compare reset_val with boot_val to determine the true default status for lock_timeout. --- config/pgwatch-prometheus/metrics.yml | 3 ++- tests/settings/test_settings_metric.py | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index 8657fba..514a198 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -377,7 +377,8 @@ metrics: category as tag_category, vartype as tag_vartype, case when (case when name = 'lock_timeout' then reset_val else setting end) ~ '^-?[0-9]+$' then (case when name = 'lock_timeout' then reset_val else setting end)::bigint else null end as numeric_value, - case when source <> 'default' then 0 else 1 end as is_default, + -- For lock_timeout, compare reset_val with boot_val since source becomes 'session' during collection + case when name = 'lock_timeout' then (case when reset_val = boot_val then 1 else 0 end) else (case when source <> 'default' then 0 else 1 end) end as is_default, 1 as configured from pg_settings gauges: diff --git a/tests/settings/test_settings_metric.py b/tests/settings/test_settings_metric.py index df16126..a2e0023 100644 --- a/tests/settings/test_settings_metric.py +++ b/tests/settings/test_settings_metric.py @@ -110,6 +110,27 @@ def test_settings_metric_numeric_value_uses_correct_source_for_lock_timeout(metr f"CASE expression for lock_timeout should appear at least twice (for tag_setting_value and numeric_value), found {occurrences}" +@pytest.mark.unit +def test_settings_metric_is_default_uses_boot_val_for_lock_timeout(metrics_config: dict) -> None: + """ + Test that is_default compares reset_val with boot_val for lock_timeout. + + When pgwatch sets lock_timeout during collection, the source becomes 'session', + which would incorrectly report is_default=0. Instead, we should compare + reset_val with boot_val to determine if the configured value is the default. + + See: https://gitlab.com/postgres-ai/postgres_ai/-/issues/61 + """ + sql = metrics_config["metrics"]["settings"]["sqls"][11] + + # Verify is_default uses boot_val comparison for lock_timeout + assert "boot_val" in sql, "SQL should reference boot_val column for is_default comparison" + + # Check for the specific pattern that handles lock_timeout specially + assert "case when name = 'lock_timeout' then" in sql and "boot_val" in sql, \ + "SQL should use special handling for lock_timeout in is_default calculation" + + @pytest.mark.integration @pytest.mark.requires_postgres def test_settings_metric_lock_timeout_returns_actual_value() -> None: From fb693bfef429f0eed2d1b55a92f35cd57a0585db Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Dec 2025 05:42:31 +0000 Subject: [PATCH 4/5] fix: Also use reset_val for statement_timeout in settings metric pgwatch sets statement_timeout per-metric during collection, which would mask the actual configured value. Apply the same reset_val fix as lock_timeout. --- config/pgwatch-prometheus/metrics.yml | 16 ++++----- tests/settings/test_settings_metric.py | 48 ++++++++++++++------------ 2 files changed, 34 insertions(+), 30 deletions(-) diff --git a/config/pgwatch-prometheus/metrics.yml b/config/pgwatch-prometheus/metrics.yml index 514a198..4e0090d 100644 --- a/config/pgwatch-prometheus/metrics.yml +++ b/config/pgwatch-prometheus/metrics.yml @@ -362,23 +362,23 @@ metrics: This metric collects various PostgreSQL server settings and configurations. It provides insights into the server's configuration, including version, memory settings, and other important parameters. This metric is useful for monitoring server settings and ensuring optimal performance. - Note: For lock_timeout, we use reset_val instead of setting because pgwatch sets - lock_timeout to 100ms during metric collection, which would mask the actual configured value. + Note: For lock_timeout and statement_timeout, we use reset_val instead of setting because + pgwatch overrides these during metric collection, which would mask the actual configured values. sqls: 11: |- select /* pgwatch_generated */ (extract(epoch from now()) * 1e9)::int8 as epoch_ns, current_database() as tag_datname, name as tag_setting_name, - -- Use reset_val for lock_timeout because pgwatch sets it to 100ms during collection, - -- which would mask the actual configured value. reset_val shows the true server config. - case when name = 'lock_timeout' then reset_val else setting end as tag_setting_value, + -- Use reset_val for lock_timeout/statement_timeout because pgwatch overrides them during + -- collection (lock_timeout=100ms, statement_timeout per-metric), masking actual config. + case when name in ('lock_timeout', 'statement_timeout') then reset_val else setting end as tag_setting_value, unit as tag_unit, category as tag_category, vartype as tag_vartype, - case when (case when name = 'lock_timeout' then reset_val else setting end) ~ '^-?[0-9]+$' then (case when name = 'lock_timeout' then reset_val else setting end)::bigint else null end as numeric_value, - -- For lock_timeout, compare reset_val with boot_val since source becomes 'session' during collection - case when name = 'lock_timeout' then (case when reset_val = boot_val then 1 else 0 end) else (case when source <> 'default' then 0 else 1 end) end as is_default, + case when (case when name in ('lock_timeout', 'statement_timeout') then reset_val else setting end) ~ '^-?[0-9]+$' then (case when name in ('lock_timeout', 'statement_timeout') then reset_val else setting end)::bigint else null end as numeric_value, + -- For these settings, compare reset_val with boot_val since source becomes 'session' during collection + case when name in ('lock_timeout', 'statement_timeout') then (case when reset_val = boot_val then 1 else 0 end) else (case when source <> 'default' then 0 else 1 end) end as is_default, 1 as configured from pg_settings gauges: diff --git a/tests/settings/test_settings_metric.py b/tests/settings/test_settings_metric.py index a2e0023..8780847 100644 --- a/tests/settings/test_settings_metric.py +++ b/tests/settings/test_settings_metric.py @@ -39,24 +39,28 @@ def test_settings_metric_exists(metrics_config: dict) -> None: @pytest.mark.unit -def test_settings_metric_uses_reset_val_for_lock_timeout(metrics_config: dict) -> None: +def test_settings_metric_uses_reset_val_for_overridden_settings(metrics_config: dict) -> None: """ - Test that the settings metric uses reset_val for lock_timeout. + Test that the settings metric uses reset_val for settings overridden by pgwatch. - This is critical because pgwatch sets lock_timeout to 100ms during metric - collection, which would mask the actual configured value if we used 'setting'. + This is critical because pgwatch overrides these during metric collection: + - lock_timeout: set to 100ms + - statement_timeout: set per-metric (e.g., 15s for settings) + + Using 'setting' would mask the actual configured values. See: https://gitlab.com/postgres-ai/postgres_ai/-/issues/61 """ sql = metrics_config["metrics"]["settings"]["sqls"][11] - # Verify the SQL uses reset_val for lock_timeout + # Verify the SQL uses reset_val for overridden settings assert "reset_val" in sql, "SQL should reference reset_val column" assert "lock_timeout" in sql, "SQL should reference lock_timeout" + assert "statement_timeout" in sql, "SQL should reference statement_timeout" - # More specific check: ensure the CASE statement is used correctly - assert "case when name = 'lock_timeout' then reset_val else setting end" in sql, \ - "SQL should use CASE statement to select reset_val for lock_timeout" + # More specific check: ensure the CASE statement handles both settings + assert "name in ('lock_timeout', 'statement_timeout')" in sql, \ + "SQL should use IN clause to handle both lock_timeout and statement_timeout" @pytest.mark.unit @@ -92,43 +96,43 @@ def test_settings_metric_has_required_fields(metrics_config: dict) -> None: @pytest.mark.unit -def test_settings_metric_numeric_value_uses_correct_source_for_lock_timeout(metrics_config: dict) -> None: +def test_settings_metric_numeric_value_uses_correct_source_for_overridden_settings(metrics_config: dict) -> None: """ - Test that numeric_value also uses reset_val for lock_timeout. + Test that numeric_value also uses reset_val for overridden settings. The numeric_value field should be derived from the same source as tag_setting_value to ensure consistency. """ sql = metrics_config["metrics"]["settings"]["sqls"][11] - # Count occurrences of the CASE expression for lock_timeout + # Count occurrences of the CASE expression for overridden settings # It should appear twice: once for tag_setting_value and once for numeric_value - case_expr = "case when name = 'lock_timeout' then reset_val else setting end" + case_expr = "case when name in ('lock_timeout', 'statement_timeout') then reset_val else setting end" occurrences = sql.count(case_expr) assert occurrences >= 2, \ - f"CASE expression for lock_timeout should appear at least twice (for tag_setting_value and numeric_value), found {occurrences}" + f"CASE expression for overridden settings should appear at least twice (for tag_setting_value and numeric_value), found {occurrences}" @pytest.mark.unit -def test_settings_metric_is_default_uses_boot_val_for_lock_timeout(metrics_config: dict) -> None: +def test_settings_metric_is_default_uses_boot_val_for_overridden_settings(metrics_config: dict) -> None: """ - Test that is_default compares reset_val with boot_val for lock_timeout. + Test that is_default compares reset_val with boot_val for overridden settings. - When pgwatch sets lock_timeout during collection, the source becomes 'session', - which would incorrectly report is_default=0. Instead, we should compare - reset_val with boot_val to determine if the configured value is the default. + When pgwatch overrides lock_timeout/statement_timeout during collection, the source + becomes 'session', which would incorrectly report is_default=0. Instead, we should + compare reset_val with boot_val to determine if the configured value is the default. See: https://gitlab.com/postgres-ai/postgres_ai/-/issues/61 """ sql = metrics_config["metrics"]["settings"]["sqls"][11] - # Verify is_default uses boot_val comparison for lock_timeout + # Verify is_default uses boot_val comparison for overridden settings assert "boot_val" in sql, "SQL should reference boot_val column for is_default comparison" - # Check for the specific pattern that handles lock_timeout specially - assert "case when name = 'lock_timeout' then" in sql and "boot_val" in sql, \ - "SQL should use special handling for lock_timeout in is_default calculation" + # Check for the specific pattern that handles overridden settings specially + assert "name in ('lock_timeout', 'statement_timeout')" in sql and "boot_val" in sql, \ + "SQL should use special handling for overridden settings in is_default calculation" @pytest.mark.integration From 56091ea535f81f18393961a74899f36bdf899721 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Dec 2025 05:51:19 +0000 Subject: [PATCH 5/5] style: Use lowercase SQL keywords per style guide --- tests/settings/test_settings_metric.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/settings/test_settings_metric.py b/tests/settings/test_settings_metric.py index 8780847..5f23a87 100644 --- a/tests/settings/test_settings_metric.py +++ b/tests/settings/test_settings_metric.py @@ -163,21 +163,21 @@ def test_settings_metric_lock_timeout_returns_actual_value() -> None: try: with conn.cursor() as cur: # First, get the actual configured lock_timeout - cur.execute("SELECT reset_val FROM pg_settings WHERE name = 'lock_timeout'") + cur.execute("select reset_val from pg_settings where name = 'lock_timeout'") actual_value = cur.fetchone()[0] # Now, simulate what pgwatch does - set a session-level lock_timeout - cur.execute("SET lock_timeout = '100ms'") + cur.execute("set lock_timeout = '100ms'") # Query using our fixed SQL logic cur.execute(""" - SELECT + select name, setting as raw_setting, reset_val, - CASE WHEN name = 'lock_timeout' THEN reset_val ELSE setting END as our_value - FROM pg_settings - WHERE name = 'lock_timeout' + case when name = 'lock_timeout' then reset_val else setting end as our_value + from pg_settings + where name = 'lock_timeout' """) row = cur.fetchone() name, raw_setting, reset_val, our_value = row