Qm7x dataset (#216)

* qm7example rebased with updated capabilities from the main * qm7x example updated * qm7x example updated * coordinates added in data.x * black formatting of train.py * option to normalize energy by number of atoms added * qm7x example updated with correct variable name * energy_per_atoms boolean variable added * write only ADIOS format as default * qm7 dataset fixed * black formatting fixed --------- Co-authored-by: Massimiliano Lupo Pasini <[email protected]> Co-authored-by: Massimiliano Lupo Pasini <[email protected]> Co-authored-by: Massimiliano Lupo Pasini <[email protected]>
ORNL · Mar 31, 2024 · a1f1c43 · a1f1c43
1 parent 89d4881
commit a1f1c43
Show file tree

Hide file tree

Showing 4 changed files with 833 additions and 0 deletions.
diff --git a/examples/qm7x/inference.py b/examples/qm7x/inference.py
@@ -0,0 +1,222 @@
+##############################################################################
+# Copyright (c) 2021, Oak Ridge National Laboratory                          #
+# All rights reserved.                                                       #
+#                                                                            #
+# This file is part of HydraGNN and is distributed under a BSD 3-clause      #
+# license. For the licensing terms see the LICENSE file in the top-level     #
+# directory.                                                                 #
+#                                                                            #
+# SPDX-License-Identifier: BSD-3-Clause                                      #
+##############################################################################
+
+import json, os
+import sys
+import logging
+import pickle
+from tqdm import tqdm
+from mpi4py import MPI
+import argparse
+
+import torch
+import numpy as np
+
+import hydragnn
+from hydragnn.utils.time_utils import Timer
+from hydragnn.utils.distributed import get_device
+from hydragnn.utils.model import load_existing_model
+from hydragnn.utils.pickledataset import SimplePickleDataset
+from hydragnn.utils.config_utils import (
+    update_config,
+)
+from hydragnn.models.create import create_model_config
+from hydragnn.preprocess import create_dataloaders
+
+from scipy.interpolate import griddata
+
+try:
+    from hydragnn.utils.adiosdataset import AdiosWriter, AdiosDataset
+except ImportError:
+    pass
+
+import matplotlib.pyplot as plt
+
+plt.rcParams.update({"font.size": 16})
+
+
+def get_log_name_config(config):
+    """Return the run/log name that encodes the main hyperparameters.
+
+    The name concatenates, in this order: model type, radius, number of
+    convolutional layers, hidden dimension, number of epochs, learning rate,
+    batch size, the input node feature indices and the task weights.
+
+    Args:
+        config: Parsed configuration dictionary. It must provide the
+            ``NeuralNetwork`` section with its ``Architecture``, ``Training``
+            and ``Variables_of_interest`` sub-sections.
+
+    Returns:
+        The assembled name as a string. Every task weight is followed by
+        ``"-"``, so a non-empty weight list yields a trailing dash.
+    """
+    nn_config = config["NeuralNetwork"]
+    arch = nn_config["Architecture"]
+
+    # ``model_type`` is concatenated as-is (it has to be a string already);
+    # every other field goes through ``str`` via the ``!s`` conversion.
+    return arch["model_type"] + (
+        f"-r-{arch['radius']!s}"
+        f"-ncl-{arch['num_conv_layers']!s}"
+        f"-hd-{arch['hidden_dim']!s}"
+        f"-ne-{nn_config['Training']['num_epoch']!s}"
+        f"-lr-{nn_config['Training']['Optimizer']['learning_rate']!s}"
+        f"-bs-{nn_config['Training']['batch_size']!s}"
+        f"-node_ft-{''.join(map(str, nn_config['Variables_of_interest']['input_node_features']))}"
+        f"-task_weights-{''.join(f'{weight!s}-' for weight in arch['task_weights'])}"
+    )
+
+
+def getcolordensity(xdata, ydata, nbin=20):
+    """Estimate a normalized point density for colouring a scatter plot.
+
+    A 2D histogram of the points is built on an ``nbin`` x ``nbin`` grid,
+    scaled so that its largest bin equals 1, and then linearly interpolated
+    from the bin centres back onto every ``(xdata[i], ydata[i])`` location.
+
+    Args:
+        xdata: Sequence of x coordinates.
+        ydata: Sequence of y coordinates, same length as ``xdata``.
+        nbin: Number of histogram bins along each axis (default 20, the
+            value that used to be hard-coded).
+
+    Returns:
+        Array with one density value per input point. Points lying outside
+        the convex hull of the bin centres receive 0 (``fill_value``).
+    """
+    hist2d, xbins_edge, ybins_edge = np.histogram2d(
+        x=xdata, y=ydata, bins=[nbin, nbin]
+    )
+    xbin_cen = 0.5 * (xbins_edge[:-1] + xbins_edge[1:])
+    ybin_cen = 0.5 * (ybins_edge[:-1] + ybins_edge[1:])
+
+    # "ij" indexing keeps axis 0 = x bins and axis 1 = y bins, i.e. the same
+    # layout as hist2d, so the flattened centres and counts stay aligned.
+    bctx, bcty = np.meshgrid(xbin_cen, ybin_cen, indexing="ij")
+    loc_pts = np.column_stack((bctx.ravel(), bcty.ravel()))
+
+    # Scale to [0, 1]; skip the division for an all-zero histogram (no
+    # points) so that it does not turn into NaNs through 0 / 0.
+    peak = np.amax(hist2d)
+    if peak > 0:
+        hist2d = hist2d / peak
+
+    hist2d_norm = griddata(
+        loc_pts,
+        hist2d.ravel(),
+        (xdata, ydata),
+        method="linear",
+        fill_value=0,
+    )
+    return hist2d_norm
+
+
+def info(*args, logtype="info", sep=" "):
+    """Log ``args`` as a single message through the ``logging`` module.
+
+    Args:
+        *args: Values to log; each one is converted with ``str``.
+        logtype: Name of the ``logging`` module-level function to call
+            (for example ``"info"`` or ``"warning"``).
+        sep: Separator placed between the stringified values.
+    """
+    # Resolve the logging function first so that an unknown ``logtype``
+    # fails before any argument is stringified.
+    emit = getattr(logging, logtype)
+    message = sep.join(str(value) for value in args)
+    emit(message)
+
+
+if __name__ == "__main__":
+    # Inference driver: load the JSON configuration and the pickled test
+    # split, restore the trained model from ./logs/, run it over the test
+    # set and, for every output head, save a density-coloured parity plot
+    # and print the accumulated test error.
+
+    modelname = "qm7x"
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--inputfile", help="input file", type=str, default="./logs/qm7x/config.json"
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--adios",
+        help="Adios gan_dataset",
+        action="store_const",
+        dest="format",
+        const="adios",
+    )
+    group.add_argument(
+        "--pickle",
+        help="Pickle gan_dataset",
+        action="store_const",
+        dest="format",
+        const="pickle",
+    )
+    parser.set_defaults(format="pickle")
+
+    args = parser.parse_args()
+
+    # The configuration path is resolved relative to this script's directory.
+    dirpwd = os.path.dirname(os.path.abspath(__file__))
+    input_filename = os.path.join(dirpwd, args.inputfile)
+    with open(input_filename, "r") as f:
+        config = json.load(f)
+    hydragnn.utils.setup_log(get_log_name_config(config))
+    ##################################################################################################################
+    # Always initialize for multi-rank training.
+    comm_size, rank = hydragnn.utils.setup_ddp()
+    ##################################################################################################################
+    comm = MPI.COMM_WORLD
+
+    comm.Barrier()
+
+    timer = Timer("load_data")
+    timer.start()
+    if args.format == "pickle":
+        info("Pickle load")
+        basedir = os.path.join(
+            os.path.dirname(__file__), "dataset", "%s.pickle" % modelname
+        )
+        var_config = config["NeuralNetwork"]["Variables_of_interest"]
+        # NOTE(review): only ``testset`` is consumed below; the train and
+        # validation splits (and ``pna_deg``) are still loaded exactly as
+        # before -- confirm whether they can be dropped.
+        trainset = SimplePickleDataset(
+            basedir=basedir, label="trainset", var_config=var_config
+        )
+        valset = SimplePickleDataset(
+            basedir=basedir, label="valset", var_config=var_config
+        )
+        testset = SimplePickleDataset(
+            basedir=basedir, label="testset", var_config=var_config
+        )
+        pna_deg = trainset.pna_deg
+    else:
+        # ``--adios`` is accepted by the parser but has no loader here.
+        raise NotImplementedError("No supported format: %s" % (args.format))
+    # Close the timing region opened above; it used to be left running.
+    timer.stop()
+
+    model = create_model_config(
+        config=config["NeuralNetwork"],
+        verbosity=config["Verbosity"]["level"],
+    )
+
+    model = torch.nn.parallel.DistributedDataParallel(model)
+
+    load_existing_model(model, modelname, path="./logs/")
+    model.eval()
+
+    # One (name, type, dim) entry per output head; ``zip`` stops at the
+    # shortest of the three configuration lists.
+    heads = list(
+        zip(
+            config["NeuralNetwork"]["Variables_of_interest"]["output_names"],
+            config["NeuralNetwork"]["Variables_of_interest"]["type"],
+            config["NeuralNetwork"]["Variables_of_interest"]["output_dim"],
+        )
+    )
+    test_MAE = [0.0 for _ in heads]
+    true_values = [[] for _ in heads]
+    predicted_values = [[] for _ in heads]
+
+    device = get_device()
+    num_samples = len(testset)
+
+    # Single pass over the test set that collects every head at once, with
+    # autograd disabled because nothing is back-propagated during inference.
+    with torch.no_grad():
+        for data in tqdm(testset):
+            outputs = model(data.to(device))
+            for variable_index, _ in enumerate(heads):
+                predicted = outputs[variable_index].flatten()
+                # ``y_loc`` holds the offsets delimiting each head's slice
+                # of the concatenated target vector ``data.y``.
+                start = data.y_loc[0][variable_index].item()
+                end = data.y_loc[0][variable_index + 1].item()
+                true = data.y[start:end, 0]
+                # NOTE(review): this is the per-sample L1 norm (sum of the
+                # absolute errors of all entries) averaged over the samples,
+                # not a per-entry mean -- confirm this is the intended metric.
+                test_MAE[variable_index] += (
+                    torch.norm(predicted - true, p=1).item() / num_samples
+                )
+                predicted_values[variable_index].extend(predicted.tolist())
+                true_values[variable_index].extend(true.tolist())
+
+    for variable_index, (output_name, output_type, output_dim) in enumerate(heads):
+        hist2d_norm = getcolordensity(
+            true_values[variable_index], predicted_values[variable_index]
+        )
+
+        fig, ax = plt.subplots()
+        plt.scatter(
+            true_values[variable_index],
+            predicted_values[variable_index],
+            s=8,
+            c=hist2d_norm,
+            vmin=0,
+            vmax=1,
+        )
+        plt.clim(0, 1)
+        # Dashed y = x reference line spanning the current x-axis limits.
+        ax.plot(ax.get_xlim(), ax.get_xlim(), ls="--", color="red")
+        plt.colorbar()
+        plt.xlabel("True values")
+        plt.ylabel("Predicted values")
+        plt.title(f"{output_name}")
+        plt.draw()
+        plt.tight_layout()
+        plt.savefig(f"./{output_name}_Scatterplot" + ".png", dpi=400)
+        # Release the figure so that open figures do not pile up per head.
+        plt.close(fig)
+
+        print(f"Test MAE {output_name}: ", test_MAE[variable_index])
diff --git a/examples/qm7x/qm7x.json b/examples/qm7x/qm7x.json
@@ -0,0 +1,67 @@
+{
+  "Verbosity": {
+    "level": 2
+  },
+  "NeuralNetwork": {
+    "Architecture": {
+      "model_type": "EGNN",
+      "edge_features": ["bond_length"],
+      "equivariance": true,
+      "max_neighbours": 20,
+      "num_gaussians": 50,
+      "num_filters": 50,
+      "envelope_exponent": 5,
+      "int_emb_size": 64,
+      "basis_emb_size": 8,
+      "out_emb_size": 128,
+      "num_after_skip": 2,
+      "num_before_skip": 1,
+      "num_radial": 6,
+      "num_spherical": 7,
+      "radius": 5,
+      "hidden_dim": 200,
+      "num_conv_layers": 6,
+      "output_heads": {
+        "graph": {
+          "num_sharedlayers": 2,
+          "dim_sharedlayers": 200,
+          "num_headlayers": 2,
+          "dim_headlayers": [
+            1000,
+            1000
+          ]
+        },
+        "node": {
+          "num_headlayers": 2,
+          "dim_headlayers": [1000,1000],
+          "type": "mlp"
+        }
+      },
+      "task_weights": [
+        1, 1, 1, 1, 1
+      ]
+    },
+    "Variables_of_interest": {
+      "input_node_features": [0, 1, 2, 3],
+      "output_index": [
+        0, 1, 2, 3, 4
+      ],
+      "type": [
+        "graph", "node", "node", "node", "node"
+      ],
+      "output_dim": [1, 3, 1, 1, 1],
+      "output_names": ["HLGAP", "forces", "hCHG", "hVDIP", "hRAT"],
+      "denormalize_output": false
+    },
+    "Training": {
+      "Checkpoint" : true,
+      "num_epoch": 20,
+      "batch_size": 32,
+      "continue": 1,
+      "startfrom": "/gpfs/alpine/lrn026/world-shared/HydraGNN_Max_QM7X/HydraGNN/logs/qm7x_fullx/qm7x_fullx",
+      "Optimizer": {
+        "learning_rate": 0.001
+      }
+    }
+  }
+}
diff --git a/examples/qm7x/qm7x_single_tasking.json b/examples/qm7x/qm7x_single_tasking.json
@@ -0,0 +1,65 @@
+{
+  "Verbosity": {
+    "level": 2
+  },
+  "NeuralNetwork": {
+    "Architecture": {
+      "model_type": "EGNN",
+      "edge_features": ["bond_length"],
+      "max_neighbours": 20,
+      "num_gaussians": 50,
+      "num_filters": 50,
+      "envelope_exponent": 5,
+      "int_emb_size": 64,
+      "basis_emb_size": 8,
+      "out_emb_size": 128,
+      "num_after_skip": 2,
+      "num_before_skip": 1,
+      "num_radial": 6,
+      "num_spherical": 7,
+      "radius": 5,
+      "hidden_dim": 200,
+      "num_conv_layers": 6,
+      "output_heads": {
+        "graph": {
+          "num_sharedlayers": 2,
+          "dim_sharedlayers": 200,
+          "num_headlayers": 2,
+          "dim_headlayers": [
+            1000,
+            1000
+          ]
+        },
+        "node": {
+          "num_headlayers": 2,
+          "dim_headlayers": [1000,1000],
+          "type": "mlp"
+        }
+      },
+      "task_weights": [
+        1
+      ]
+    },
+    "Variables_of_interest": {
+      "input_node_features": [0, 1, 2, 3],
+      "output_index": [
+        0
+      ],
+      "type": [
+        "graph"
+      ],
+      "output_dim": [1],
+      "output_names": ["HLGAP"],
+      "denormalize_output": false
+    },
+    "Training": {
+      "num_epoch": 3,
+      "batch_size": 1,
+      "continue": 0,
+      "startfrom": "existing_model",
+      "Optimizer": {
+        "learning_rate": 0.001
+      }
+    }
+  }
+}