diff --git a/cfg/algo/ppo.yaml b/cfg/algo/ppo.yaml
index a23c1e3..cdd3f1c 100644
--- a/cfg/algo/ppo.yaml
+++ b/cfg/algo/ppo.yaml
@@ -20,7 +20,7 @@ optimizer:
   lr: 5e-4
 
-num_channels: 32
-num_residual_blocks: 3
+num_channels: 64
+num_residual_blocks: 4
diff --git a/cfg/baseline/ppo.yaml b/cfg/baseline/ppo.yaml
index 541fcaf..7ac6ec8 100644
--- a/cfg/baseline/ppo.yaml
+++ b/cfg/baseline/ppo.yaml
@@ -20,7 +20,7 @@ optimizer:
   lr: 3e-4
 
-num_channels: 32
-num_residual_blocks: 3
+num_channels: 64
+num_residual_blocks: 4
diff --git a/cfg/train_InRL.yaml b/cfg/train_InRL.yaml
index 43a30e6..30bc74e 100644
--- a/cfg/train_InRL.yaml
+++ b/cfg/train_InRL.yaml
@@ -7,13 +7,13 @@ run_dir:
 augment: false
 
-epochs: 1500
+epochs: 1000
 rounds: 64
 
-save_interval: -1
+save_interval: 300
 
-black_checkpoint: pretrained_models/${board_size}_${board_size}/${algo.name}/0.pt
-white_checkpoint: pretrained_models/${board_size}_${board_size}/${algo.name}/1.pt
+black_checkpoint: black_final.pt # pretrained_models/${board_size}_${board_size}/${algo.name}/0.pt
+white_checkpoint: white_final.pt # pretrained_models/${board_size}_${board_size}/${algo.name}/1.pt
 
 wandb:
   group: ${board_size}_${board_size}_${algo.name}_InRL
diff --git a/gomoku_rl/utils/elo.py b/gomoku_rl/utils/elo.py
index f653225..7207d45 100644
--- a/gomoku_rl/utils/elo.py
+++ b/gomoku_rl/utils/elo.py
@@ -19,3 +19,27 @@ def compute_elo_ratings(payoff: np.ndarray, average_rating: float = 1200) -> np.
     elo_ratings = payoff.mean(axis=-1)
     elo_ratings = elo_ratings * (400 / np.log(10)) + average_rating
     return elo_ratings
+
+
+def compute_expected_score(rating_0: float, rating_1: float) -> float:
+    return 1 / (1 + 10.0 ** ((rating_1 - rating_0) / 400))
+
+
+class Elo:
+    def __init__(self) -> None:
+        self.players: dict[str, float] = {}
+
+    def addPlayer(self, name: str, rating: float = 1200):
+        assert name not in self.players
+        self.players[name] = rating
+
+    def expected_score(self, player_0: str, player_1: str) -> float:
+        rating_0 = self.players[player_0]
+        rating_1 = self.players[player_1]
+        return compute_expected_score(rating_0, rating_1)
+
+    def update(self, player_0: str, player_1: str, score: float, K: float = 64):
+        e = self.expected_score(player_0, player_1)
+        tmp = K * (score - e)
+        self.players[player_0] = self.players[player_0] + tmp
+        self.players[player_1] = self.players[player_1] - tmp
diff --git a/pretrained_models/15_15/ppo/0.pt b/pretrained_models/15_15/ppo/0.pt
index da4fd9a..0e3b99f 100644
Binary files a/pretrained_models/15_15/ppo/0.pt and b/pretrained_models/15_15/ppo/0.pt differ
diff --git a/pretrained_models/15_15/ppo/1.pt b/pretrained_models/15_15/ppo/1.pt
index c5c64b2..e2c96f6 100644
Binary files a/pretrained_models/15_15/ppo/1.pt and b/pretrained_models/15_15/ppo/1.pt differ
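For reference, a minimal usage sketch of the Elo tracker added in gomoku_rl/utils/elo.py above. This is not part of the commit: the import path assumes the module location shown in the diff, and the player names are made up for illustration.

    # Sketch only: exercises the Elo class from the diff above.
    from gomoku_rl.utils.elo import Elo, compute_expected_score

    elo = Elo()
    elo.addPlayer("black_final")  # default rating 1200
    elo.addPlayer("white_final")

    # Expected score for player_0 follows the standard Elo curve
    # E = 1 / (1 + 10 ** ((R1 - R0) / 400)); equal ratings give 0.5.
    assert compute_expected_score(1200, 1200) == 0.5

    # score is from player_0's perspective: 1.0 = win, 0.5 = draw, 0.0 = loss.
    elo.update("black_final", "white_final", score=1.0)

    # The update is zero-sum: the winner gains K * (score - E) = 64 * 0.5 = 32,
    # and the loser loses the same amount.
    print(elo.players)  # {'black_final': 1232.0, 'white_final': 1168.0}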