Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open catalyst #211

Merged
merged 27 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
dfed076
open catalysts pre-load scripts added
allaffa Mar 10, 2024
6e61df1
train.py added
allaffa Mar 10, 2024
d20e266
data loading, data processing, and training added
allaffa Mar 11, 2024
a399d97
test set changed to validation because we need labels
Mar 12, 2024
d1f8c0d
exception handler in data loader removed
Mar 12, 2024
2634858
skip command added if len(traj_logs) != len(traj_frames)
Mar 12, 2024
b2f552b
fix adios
Mar 13, 2024
9e82185
Merge pull request #10 from jychoi-hpc/dev-open-catalyst
allaffa Mar 13, 2024
bf21bea
removing second np.split
Mar 13, 2024
3fe60ea
uncompress file added
Mar 13, 2024
4141d06
energy float converted into tensor
allaffa Mar 13, 2024
df3880b
add update_predicted_values in adios reader
jychoi-hpc Mar 13, 2024
f34171c
add var_config for adios
jychoi-hpc Mar 13, 2024
f055291
Merge pull request #11 from jychoi-hpc/dev-open-catalyst
allaffa Mar 13, 2024
3a31ae0
file reader uses glob.iglob
Mar 13, 2024
7bba7b6
Merge branch 'open_catalyst' of https://github.com/allaffa/HydraGNN i…
Mar 13, 2024
00c469a
minor update
Mar 13, 2024
2b2a3e1
Merge pull request #12 from jychoi-hpc/dev-open-catalyst
allaffa Mar 14, 2024
8bfa8d9
conflict merged
Mar 14, 2024
00caab7
choice of adios or pickle in pre-loading
Mar 14, 2024
4f715fc
minor fixes
jychoi-hpc Mar 14, 2024
1eddd13
Merge branch 'open_catalyst' into dev-open-catalyst
jychoi-hpc Mar 14, 2024
48cc105
Merge pull request #13 from jychoi-hpc/dev-open-catalyst
allaffa Mar 14, 2024
84fcaf9
formatting fixed
allaffa Mar 14, 2024
7e47fcf
black reformatting of utils/adiosdataset.py
allaffa Mar 14, 2024
fa100bd
black reformatting of scripts in examples/open_catalyst_2020
allaffa Mar 14, 2024
1d1c06e
Dataset section in the JSON file is removed
allaffa Mar 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 153 additions & 0 deletions examples/open_catalyst_2020/download_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import argparse
import glob
import logging
import os

"""
This script provides users with an automated way to download, preprocess (where
applicable), and organize data to readily be used by the existing config files.
"""

# Official Open Catalyst Project (OCP) archive URLs.
# "s2ef" (structure-to-energy-and-forces) is keyed by split name; "is2re"
# (initial-structure-to-relaxed-energy) ships as a single archive.
DOWNLOAD_LINKS = {
    "s2ef": {
        # Training splits of increasing size.
        "200k": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_200K.tar",
        "2M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_2M.tar",
        "20M": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_20M.tar",
        "all": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_train_all.tar",
        # Validation splits: in-domain plus out-of-distribution variants.
        "val_id": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_id.tar",
        "val_ood_ads": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_ads.tar",
        "val_ood_cat": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_cat.tar",
        "val_ood_both": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_val_ood_both.tar",
        # Test split is distributed as LMDBs (handled separately in get_data).
        "test": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_test_lmdbs.tar.gz",
        # Auxiliary datasets.
        "rattled": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_rattled.tar",
        "md": "https://dl.fbaipublicfiles.com/opencatalystproject/data/s2ef_md.tar",
    },
    "is2re": "https://dl.fbaipublicfiles.com/opencatalystproject/data/is2res_train_val_test_lmdbs.tar.gz",
}

# Expected number of samples per s2ef split; consumed by verify_count() to
# confirm that download/extraction/preprocessing completed without loss.
S2EF_COUNTS = {
    "s2ef": {
        "200k": 200000,
        "2M": 2000000,
        "20M": 20000000,
        "all": 133934018,
        "val_id": 999866,
        "val_ood_ads": 999838,
        "val_ood_cat": 999809,
        "val_ood_both": 999944,
        "rattled": 16677031,
        "md": 38315405,
    },
}


def get_data(datadir, task, split, del_intmd_files):
    """Download an Open Catalyst dataset archive, extract it, and organize it.

    Args:
        datadir: destination directory for downloaded and extracted data.
        task: dataset task, either "s2ef" or "is2re".
        split: s2ef split name (e.g. "200k", "val_id"); ignored for "is2re".
        del_intmd_files: if True, intermediate files would be removed
            (the cleanup call is currently disabled below).

    Raises:
        NotImplementedError: if task is "s2ef" and split is None.
        AssertionError: if an unknown s2ef split is requested.
        ValueError: if task is neither "s2ef" nor "is2re".
    """
    os.makedirs(datadir, exist_ok=True)

    if task == "s2ef" and split is None:
        raise NotImplementedError("S2EF requires a split to be defined.")

    if task == "s2ef":
        assert (
            split in DOWNLOAD_LINKS[task]
        ), f'S2EF "{split}" split not defined, please specify one of the following: {list(DOWNLOAD_LINKS["s2ef"].keys())}'
        download_link = DOWNLOAD_LINKS[task][split]
    elif task == "is2re":
        download_link = DOWNLOAD_LINKS[task]
    else:
        # BUG FIX: an unrecognized task previously fell through and crashed
        # later with a NameError on download_link; fail fast and clearly.
        raise ValueError(f"Unknown task {task!r}; expected 's2ef' or 'is2re'.")

    os.system(f"wget {download_link} -P {datadir}")
    filename = os.path.join(datadir, os.path.basename(download_link))
    logging.info("Extracting contents...")
    # BUG FIX: the tar command did not reference the downloaded archive;
    # extract the file we just fetched into datadir.
    os.system(f"tar -xvf {filename} -C {datadir}")
    # The archive unpacks into a directory named after the archive basename
    # without its extension, e.g. s2ef_train_200K.tar -> s2ef_train_200K.
    dirname = os.path.join(
        datadir,
        os.path.basename(filename).split(".")[0],
    )
    if task == "s2ef" and split != "test":
        compressed_dir = os.path.join(dirname, os.path.basename(dirname))
        if split in ["200k", "2M", "20M", "all", "rattled", "md"]:
            # Training-style splits land under <datadir>/s2ef/<split>/train.
            output_path = os.path.join(datadir, task, split, "train")
        else:
            # Validation splits are grouped under <datadir>/s2ef/all/<split>.
            output_path = os.path.join(datadir, task, "all", split)
        uncompressed_dir = uncompress_data(compressed_dir)
        # preprocess_data(uncompressed_dir, output_path)

        # verify_count(output_path, task, split)
    if task == "s2ef" and split == "test":
        if not (os.path.exists(f"{datadir}/s2ef/all")):
            os.makedirs(f"{datadir}/s2ef/all")
        os.system(f"mv {dirname}/test_data/s2ef/all/test_* {datadir}/s2ef/all")
    elif task == "is2re":
        os.system(f"mv {dirname}/data/is2re {datadir}")

    # if del_intmd_files:
    #     cleanup(filename, dirname)


def uncompress_data(compressed_dir):
    """Uncompress an extracted s2ef directory via the `uncompress` helper.

    Builds the helper's argument namespace programmatically (input directory
    plus a sibling "<dir>_uncompressed" output directory) and runs it.

    Returns:
        Path of the directory containing the uncompressed data.
    """
    import uncompress

    # Reuse the helper's own CLI parser so defaults stay in sync with it.
    helper_args, _ = uncompress.get_parser().parse_known_args()
    helper_args.ipdir = compressed_dir
    helper_args.opdir = os.path.dirname(compressed_dir) + "_uncompressed"
    uncompress.main(helper_args)
    return helper_args.opdir


def preprocess_data(uncompressed_dir, output_path):
    """Run the `preprocess` helper on an uncompressed data directory.

    Args:
        uncompressed_dir: directory produced by uncompress_data().
        output_path: destination directory for the preprocessed output.
    """
    import preprocess as preprocess_ad

    # BUG FIX: `args` was referenced without being defined in this scope
    # (NameError when called). Obtain the namespace from the helper's own
    # parser, mirroring the pattern used in uncompress_data().
    # NOTE(review): assumes the preprocess module exposes get_parser() like
    # the uncompress module does — confirm against the helper script.
    parser = preprocess_ad.get_parser()
    args, _ = parser.parse_known_args()
    args.data_path = uncompressed_dir
    args.out_path = output_path
    preprocess_ad.main(args)


def verify_count(output_path, task, split):
    """Verify the extracted sample count against the expected split size.

    Sums the number of lines across every *.txt file directly under
    output_path and compares the total to S2EF_COUNTS[task][split].

    Raises:
        AssertionError: if the counted lines differ from the expected count.
    """
    paths = glob.glob(os.path.join(output_path, "*.txt"))
    count = 0
    for path in paths:
        # BUG FIX: use a context manager so each file handle is closed
        # deterministically instead of leaking until garbage collection.
        with open(path, "r") as fh:
            count += len(fh.read().splitlines())
    assert (
        count == S2EF_COUNTS[task][split]
    ), f"S2EF {split} count incorrect, verify preprocessing has completed successfully."


def cleanup(filename, dirname):
    """Delete the downloaded archive and both extraction directories.

    Each path is removed only if it exists, so the function is safe to call
    regardless of how far the download/extraction pipeline progressed.

    Args:
        filename: path of the downloaded archive file.
        dirname: extraction directory; its "<dirname>_uncompressed" sibling
            is removed as well.
    """
    import shutil

    for directory in (dirname, dirname + "_uncompressed"):
        if os.path.exists(directory):
            shutil.rmtree(directory)
    if os.path.exists(filename):
        os.remove(filename)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--task", type=str, help="Task to download")
parser.add_argument(
"--split", type=str, help="Corresponding data split to download"
)
parser.add_argument(
"--keep",
action="store_true",
help="Keep intermediate directories and files upon data retrieval/processing",
)
parser.add_argument(
"--data-path",
type=str,
default="./dataset",
help="Specify path to save dataset. Defaults to './dataset'",
)

args, _ = parser.parse_known_args()
get_data(
datadir=args.data_path,
task=args.task,
split=args.split,
del_intmd_files=not args.keep,
)
58 changes: 58 additions & 0 deletions examples/open_catalyst_2020/open_catalyst_energy.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
{
"Verbosity": {
"level": 2
},
"NeuralNetwork": {
"Architecture": {
"model_type": "EGNN",
"equivariance": true,
"radius": 5.0,
"max_neighbours": 100000,
"num_gaussians": 50,
"envelope_exponent": 5,
"int_emb_size": 64,
"basis_emb_size": 8,
"out_emb_size": 128,
"num_after_skip": 2,
"num_before_skip": 1,
"num_radial": 6,
"num_spherical": 7,
"num_filters": 126,
"edge_features": ["coord_x", "coord_y", "coord_z"],
"hidden_dim": 50,
"num_conv_layers": 3,
"output_heads": {
"graph":{
"num_sharedlayers": 2,
"dim_sharedlayers": 50,
"num_headlayers": 2,
"dim_headlayers": [50,25]
}
},
"task_weights": [1.0]
},
"Variables_of_interest": {
"input_node_features": [0, 1, 2, 3],
"output_names": ["energy"],
"output_index": [0],
"output_dim": [1],
"type": ["graph"]
},
"Training": {
"num_epoch": 50,
"perc_train": 0.8,
"loss_function_type": "mae",
"batch_size": 32,
"continue": 0,
"Optimizer": {
"type": "AdamW",
"learning_rate": 1e-3
}
}
},
"Visualization": {
"plot_init_solution": true,
"plot_hist_solution": false,
"create_plots": true
}
}
58 changes: 58 additions & 0 deletions examples/open_catalyst_2020/open_catalyst_forces.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
{
"Verbosity": {
"level": 2
},
"NeuralNetwork": {
"Architecture": {
"model_type": "EGNN",
"equivariance": true,
"radius": 5.0,
"max_neighbours": 100000,
"num_gaussians": 50,
"envelope_exponent": 5,
"int_emb_size": 64,
"basis_emb_size": 8,
"out_emb_size": 128,
"num_after_skip": 2,
"num_before_skip": 1,
"num_radial": 6,
"num_spherical": 7,
"num_filters": 126,
"edge_features": ["coord_x", "coord_y", "coord_z"],
"hidden_dim": 50,
"num_conv_layers": 3,
"output_heads": {
"node": {
"num_headlayers": 2,
"dim_headlayers": [200,200],
"type": "mlp"
}
},
"task_weights": [1.0]
},
"Variables_of_interest": {
"input_node_features": [0, 1, 2, 3],
"output_names": ["forces"],
"output_index": [2],
"output_dim": [3],
"type": ["node"]
},
"Training": {
"num_epoch": 50,
"EarlyStopping": true,
"perc_train": 0.9,
"loss_function_type": "mae",
"batch_size": 32,
"continue": 0,
"Optimizer": {
"type": "AdamW",
"learning_rate": 1e-3
}
}
},
"Visualization": {
"plot_init_solution": true,
"plot_hist_solution": false,
"create_plots": true
}
}
Loading
Loading