From 09a6263db878d2919f1ca8e8eb335999c852154f Mon Sep 17 00:00:00 2001 From: Bruno Carrez Date: Tue, 5 Mar 2024 00:57:23 -0500 Subject: [PATCH 1/5] add test of logs and traces in test_nila_diskusage --- tests/functional/diskusage/test_mila_diskusage.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/functional/diskusage/test_mila_diskusage.py b/tests/functional/diskusage/test_mila_diskusage.py index 939c90a9..6c497c10 100644 --- a/tests/functional/diskusage/test_mila_diskusage.py +++ b/tests/functional/diskusage/test_mila_diskusage.py @@ -1,6 +1,8 @@ +import logging from pathlib import Path import pytest +from opentelemetry.trace import Status, StatusCode, get_tracer import sarc.storage.mila from sarc.ldap.api import Credentials, User @@ -62,9 +64,10 @@ def mock_get_users(): "test_config", [{"clusters": {"mila": {"host": "mila"}}}], indirect=True ) def test_mila_fetch_diskusage_single( - test_config, monkeypatch, cli_main, file_regression + test_config, monkeypatch, cli_main, file_regression, caplog, captrace ): count = 0 + caplog.set_level(logging.INFO) def mock_get_report(*args): nonlocal count @@ -93,6 +96,16 @@ def mock_get_report(*args): # report = sarc.storage.mila.fetch_diskusage_report(cluster=test_config.clusters["mila"]) file_regression.check(data[0].json(exclude={"id": True}, indent=4)) + # check logging + assert caplog.text.__contains__("Acquiring mila storages report...") + + # check traces + traces = captrace.get_finished_spans() + assert len(traces) == 1 # only one cluster was acquired + assert traces[0].name == "cluster" + assert traces[0].attributes["cluster_name"] == "mila" + assert traces[0].status.status_code == StatusCode.OK + @pytest.mark.freeze_time("2023-07-25") @pytest.mark.parametrize( From 7bd0466d81aa8101c56501aec4fdab0af8a99f04 Mon Sep 17 00:00:00 2001 From: Bruno Carrez Date: Tue, 5 Mar 2024 00:58:11 -0500 Subject: [PATCH 2/5] add logs and trace in acquire diskusage --- sarc/cli/acquire/storages.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/sarc/cli/acquire/storages.py b/sarc/cli/acquire/storages.py index 84df6cae..14ee4d6d 100644 --- a/sarc/cli/acquire/storages.py +++ b/sarc/cli/acquire/storages.py @@ -1,5 +1,7 @@ +import logging from dataclasses import dataclass +from opentelemetry.trace import Status, StatusCode, get_tracer from simple_parsing import field from sarc.config import config @@ -18,22 +20,29 @@ class AcquireStorages: cluster_names: list[str] = field(alias=["-c"], default_factory=list) dry: bool = False + tracer = get_tracer("AcquireStorages") + def execute(self) -> int: cfg = config() for cluster_name in self.cluster_names: - print(f"Acquiring {cluster_name} storages report...") + with self.tracer.start_as_current_span("cluster") as span: + span.set_attribute("cluster_name", cluster_name) + + logging.info(f"Acquiring {cluster_name} storages report...") + + cluster = cfg.clusters[cluster_name] - cluster = cfg.clusters[cluster_name] + fetch_diskusage = methods.get(cluster_name, methods["default"]) + du = fetch_diskusage(cluster) - fetch_diskusage = methods.get(cluster_name, methods["default"]) - du = fetch_diskusage(cluster) + if not self.dry: + collection = get_diskusage_collection() + collection.add(du) + else: + print("Document:") + print(du.json(indent=2)) - if not self.dry: - collection = get_diskusage_collection() - collection.add(du) - else: - print("Document:") - print(du.json(indent=2)) + span.set_status(Status(StatusCode.OK)) return 0 From c58821f7d85b4c1027caadb8f671cab0ef3b787d Mon Sep 17 00:00:00 2001 From: Bruno Carrez Date: Tue, 5 Mar 2024 16:19:46 -0500 Subject: [PATCH 3/5] add using_span context manager to handle traces and limit exceptions effet to discrete parts of code --- sarc/cli/acquire/storages.py | 9 +++------ sarc/traces.py | 20 +++++++++++++++++++ .../diskusage/test_mila_diskusage.py | 3 ++- 3 files changed, 25 insertions(+), 7 deletions(-) create mode 100644 sarc/traces.py diff --git a/sarc/cli/acquire/storages.py b/sarc/cli/acquire/storages.py index 14ee4d6d..f72d39e4 100644 --- a/sarc/cli/acquire/storages.py +++ b/sarc/cli/acquire/storages.py @@ -1,13 +1,13 @@ import logging from dataclasses import dataclass -from opentelemetry.trace import Status, StatusCode, get_tracer from simple_parsing import field from sarc.config import config from sarc.storage.diskusage import get_diskusage_collection from sarc.storage.drac import fetch_diskusage_report as fetch_dirac_diskusage from sarc.storage.mila import fetch_diskusage_report as fetch_mila_diskusage +from sarc.traces import using_trace methods = { "default": fetch_dirac_diskusage, @@ -20,16 +20,15 @@ class AcquireStorages: cluster_names: list[str] = field(alias=["-c"], default_factory=list) dry: bool = False - tracer = get_tracer("AcquireStorages") - def execute(self) -> int: cfg = config() for cluster_name in self.cluster_names: - with self.tracer.start_as_current_span("cluster") as span: + with using_trace("AcquireStorages", "cluster") as span: span.set_attribute("cluster_name", cluster_name) logging.info(f"Acquiring {cluster_name} storages report...") + span.add_event(f"Acquiring {cluster_name} storages report...") cluster = cfg.clusters[cluster_name] @@ -43,6 +42,4 @@ def execute(self) -> int: print("Document:") print(du.json(indent=2)) - span.set_status(Status(StatusCode.OK)) - return 0 diff --git a/sarc/traces.py b/sarc/traces.py new file mode 100644 index 00000000..46569ccc --- /dev/null +++ b/sarc/traces.py @@ -0,0 +1,20 @@ +from contextlib import contextmanager + +from opentelemetry.trace import Status, StatusCode, get_tracer + + +# context manager to manage traces & span, +# and catch exceptions (in span) without breaking the whole program execution +@contextmanager +def using_trace(tracer_name: str, span_name: str): + tracer = get_tracer(tracer_name) + with tracer.start_as_current_span(span_name) as span: + try: + yield span + span.set_status(Status(StatusCode.OK)) + except Exception as exc: + span.set_status(Status(StatusCode.ERROR)) + span.record_exception(exc) + finally: + # nothing to do, end or close... + pass diff --git a/tests/functional/diskusage/test_mila_diskusage.py b/tests/functional/diskusage/test_mila_diskusage.py index 6c497c10..08d49f2d 100644 --- a/tests/functional/diskusage/test_mila_diskusage.py +++ b/tests/functional/diskusage/test_mila_diskusage.py @@ -2,7 +2,7 @@ from pathlib import Path import pytest -from opentelemetry.trace import Status, StatusCode, get_tracer +from opentelemetry.trace import StatusCode import sarc.storage.mila from sarc.ldap.api import Credentials, User @@ -104,6 +104,7 @@ def mock_get_report(*args): assert len(traces) == 1 # only one cluster was acquired assert traces[0].name == "cluster" assert traces[0].attributes["cluster_name"] == "mila" + assert "Acquiring mila storages report..." in [e.name for e in traces[0].events] assert traces[0].status.status_code == StatusCode.OK From 8d02b0e1a4aa12ec3b25a0b34d813631432e79ba Mon Sep 17 00:00:00 2001 From: Bruno Carrez Date: Tue, 5 Mar 2024 16:33:00 -0500 Subject: [PATCH 4/5] lint --- sarc/cli/acquire/storages.py | 1 + sarc/traces.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sarc/cli/acquire/storages.py b/sarc/cli/acquire/storages.py index f72d39e4..fefda5a9 100644 --- a/sarc/cli/acquire/storages.py +++ b/sarc/cli/acquire/storages.py @@ -27,6 +27,7 @@ def execute(self) -> int: with using_trace("AcquireStorages", "cluster") as span: span.set_attribute("cluster_name", cluster_name) + # pylint: disable=logging-fstring-interpolation logging.info(f"Acquiring {cluster_name} storages report...") span.add_event(f"Acquiring {cluster_name} storages report...") diff --git a/sarc/traces.py b/sarc/traces.py index 46569ccc..9da00561 100644 --- a/sarc/traces.py +++ b/sarc/traces.py @@ -12,7 +12,7 @@ def using_trace(tracer_name: str, span_name: str): try: yield span span.set_status(Status(StatusCode.OK)) - except Exception as exc: + except Exception as exc: # pylint: disable=broad-exception-caught span.set_status(Status(StatusCode.ERROR)) span.record_exception(exc) finally: From 659960f8910b621b489c904834a8fdcca6ba14c5 Mon Sep 17 00:00:00 2001 From: Bruno Carrez Date: Tue, 5 Mar 2024 17:59:51 -0500 Subject: [PATCH 5/5] ADD 3 unit tests for using_traces --- sarc/traces.py | 17 +++++- tests/unittests/test_using_trace.py | 86 +++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 tests/unittests/test_using_trace.py diff --git a/sarc/traces.py b/sarc/traces.py index 9da00561..744e5cff 100644 --- a/sarc/traces.py +++ b/sarc/traces.py @@ -6,7 +6,18 @@ # context manager to manage traces & span, # and catch exceptions (in span) without breaking the whole program execution @contextmanager -def using_trace(tracer_name: str, span_name: str): +def using_trace( + tracer_name: str, span_name: str, exception_types=[Exception] +): # pylint: disable=dangerous-default-value + """ + Context manager to manage traces & span, + and catch exceptions (in span) without breaking the whole program execution. + Parameters: + tracer_name (str): name of the tracer (can be shared between different spans) + span_name (str): name of the span + exception_types ([type,..]): types of exceptions to catch, other types will be raised + by default, `Exception` is caught so that nothing would be raised + """ tracer = get_tracer(tracer_name) with tracer.start_as_current_span(span_name) as span: try: @@ -15,6 +26,10 @@ def using_trace(tracer_name: str, span_name: str): except Exception as exc: # pylint: disable=broad-exception-caught span.set_status(Status(StatusCode.ERROR)) span.record_exception(exc) + + # re-raise the exception to be caught by the caller + if not any(isinstance(exc, t) for t in exception_types): + raise finally: # nothing to do, end or close... pass diff --git a/tests/unittests/test_using_trace.py b/tests/unittests/test_using_trace.py new file mode 100644 index 00000000..5878fae6 --- /dev/null +++ b/tests/unittests/test_using_trace.py @@ -0,0 +1,86 @@ +# test the traces helper context manager + +from opentelemetry.trace import Status, StatusCode, get_tracer + +from sarc.traces import using_trace + + +def test_using_trace_noerror(captrace): + with using_trace("test_using_trace_noerror", "span1") as span: + span.add_event("event1") + with using_trace("test_using_trace_noerror", "span2") as span: + span.add_event("event2.1") + span.add_event("event2.2") + + # check traces + traces = captrace.get_finished_spans() + assert len(traces) == 2 + assert traces[0].name == "span1" + assert len(traces[0].events) == 1 + assert "event1" in [e.name for e in traces[0].events] + assert traces[0].status.status_code == StatusCode.OK + assert traces[1].name == "span2" + assert len(traces[1].events) == 2 + assert "event2.1" in [e.name for e in traces[1].events] + assert "event2.2" in [e.name for e in traces[1].events] + assert traces[1].status.status_code == StatusCode.OK + + +def test_using_trace_default_exception(captrace): + with using_trace("test_using_trace_noerror", "span1") as span: + pass + with using_trace("test_using_trace_default_exception", "span2") as span: + span.add_event("event1") + raise Exception("test exception") + traces = captrace.get_finished_spans() + assert len(traces) == 2 + assert traces[0].status.status_code == StatusCode.OK + assert traces[1].status.status_code == StatusCode.ERROR + + +def test_using_trace_expected_exception(captrace): + exception_caught = False + try: + with using_trace("test_using_trace_noerror", "span1") as span: + pass + with using_trace( + "test_using_trace_default_exception", + "span2", + exception_types=[ZeroDivisionError], + ) as span: + span.add_event("event1") + a = 1 / 0 + except Exception: + exception_caught = True + + assert ( + exception_caught == False + ) # the exception should be handled by the context manager + traces = captrace.get_finished_spans() + assert len(traces) == 2 + assert traces[0].status.status_code == StatusCode.OK + assert traces[1].status.status_code == StatusCode.ERROR + + +def test_using_trace_unexpected_exception(captrace): + exception_caught = False + try: + with using_trace("test_using_trace_noerror", "span1") as span: + pass + with using_trace( + "test_using_trace_default_exception", + "span2", + exception_types=[ZeroDivisionError], + ) as span: + span.add_event("event1") + assert False + except Exception: + exception_caught = True + + assert ( + exception_caught == True + ) # the ExceptionError is not handled by the context manager + traces = captrace.get_finished_spans() + assert len(traces) == 2 + assert traces[0].status.status_code == StatusCode.OK + assert traces[1].status.status_code == StatusCode.ERROR