Skip to content

Sanitize Metric Name in Checkpoints #990

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions tests/utils/test_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -1139,6 +1139,12 @@ def test_best_checkpoint_path(self) -> None:
best_path,
)

# apply metric name sanitization ('/' is replaced with '_')
self.assertEqual(
get_best_checkpoint_path(temp_dir, "val/loss", "min"),
best_path,
)

# handle negative values
best_path_2 = os.path.join(temp_dir, "epoch_0_step_0_val_loss=-0.01")
os.mkdir(best_path_2)
Expand Down Expand Up @@ -1373,6 +1379,15 @@ def test_get_checkpoint_dirpaths(self) -> None:
{path1, path2, path3},
)

# with metric name sanitization
self.assertEqual(
{
str(x)
for x in get_checkpoint_dirpaths(temp_dir, metric_name="val/loss")
},
{path1, path2, path3},
)

with tempfile.TemporaryDirectory() as temp_dir:
self.assertEqual(
get_checkpoint_dirpaths(temp_dir),
Expand Down
29 changes: 25 additions & 4 deletions torchtnt/utils/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,27 @@
@dataclass
class MetricData:
    """
    Representation of a metric instance. Should provide both a metric name and its value.

    Note: The metric name is sanitized by replacing '/' with '_' to prevent potential issues
    when using the name as a path or identifier.

    Attributes:
        name: Metric name; stored in sanitized form (every '/' replaced with '_').
        value: Value recorded for the metric.
    """

    name: str
    value: float

    def __post_init__(self) -> None:
        # Sanitize in the dataclass post-init hook instead of overriding
        # ``__init__``: the generated initializer stays in charge of field
        # assignment, and dataclass subclasses that add fields still get a
        # sanitized name because ``__post_init__`` is inherited.
        self.name = self.sanitize_metric_name(self.name)

    @staticmethod
    def sanitize_metric_name(name: str) -> str:
        """
        Sanitizes a metric name by replacing '/' with '_'.
        This is done to prevent potential issues when using the name as a path or identifier.

        Args:
            name: Raw metric name, e.g. ``"val/loss"``.

        Returns:
            The name with every '/' replaced by '_', e.g. ``"val_loss"``.
        """
        return name.replace("/", "_")


@dataclass
class BestCheckpointConfig:
Expand Down Expand Up @@ -481,9 +496,14 @@ def generate_checkpoint_path(
self._best_checkpoint_config
), "Attempted to get a checkpoint with metric but best checkpoint config is not set"

assert self._best_checkpoint_config.monitored_metric == metric_data.name, (
assert (
MetricData.sanitize_metric_name(
self._best_checkpoint_config.monitored_metric
)
== metric_data.name
), (
f"Attempted to get a checkpoint with metric '{metric_data.name}', "
f"but best checkpoint config is for '{none_throws(self._best_checkpoint_config).monitored_metric}'"
f"but best checkpoint config is for '{MetricData.sanitize_metric_name(none_throws(self._best_checkpoint_config).monitored_metric)}'"
)

checkpoint_path = CheckpointPath(
Expand Down Expand Up @@ -815,7 +835,8 @@ def _retrieve_checkpoint_dirpaths(

# If a metric was provided, keep only the checkpoints tracking it
if metric_name and not (
ckpt.metric_data and ckpt.metric_data.name == metric_name
ckpt.metric_data
and ckpt.metric_data.name == MetricData.sanitize_metric_name(metric_name)
):
continue

Expand Down