diff --git a/CHANGELOG.md b/CHANGELOG.md index f596606c..95016db8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ The rules for this file: talagayev, NDoering99 ### Added +- Addition of two `protein` and `residue` interaction in binding modes (2026-01-19, PR#170) - Addition of force field version selection for `GAFF` & `SMINROFF` (2025-12-15, PR #169) - Addition of `CHARM2024` forcefield (2025-11-03, PR #163) diff --git a/openmmdl/openmmdl_analysis/analysis/bindingmodes.py b/openmmdl/openmmdl_analysis/analysis/bindingmodes.py index 43eae1a6..de43c632 100644 --- a/openmmdl/openmmdl_analysis/analysis/bindingmodes.py +++ b/openmmdl/openmmdl_analysis/analysis/bindingmodes.py @@ -48,12 +48,14 @@ def __init__( interaction_list, threshold, total_frames, + schema="residue", ): self.pdb_md = pdb_md self.ligand = ligand self.peptide = peptide self.special = special self.threshold = threshold + self.schema = schema self.ligand_rings = ligand_rings self.total_frames = total_frames self.unique_columns_rings_grouped = self._gather_interactions(interaction_list) @@ -84,17 +86,20 @@ def _gather_interactions(self, df): # Check if the 'INTERACTION' is 'hydrophobic' if row["INTERACTION"] == "hydrophobic": # Get the values from the current row - prot_partner = row["Prot_partner"] - ligcarbonidx = row["LIGCARBONIDX"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" + ligcarbonidx = int(row["LIGCARBONIDX"]) interaction = row["INTERACTION"] ring_found = False # Concatenate the values to form a unique column name - for ligand_ring in self.ligand_rings: + for ligand_ring in (self.ligand_rings or []): if ligcarbonidx in ligand_ring: numbers_as_strings = [str(ligcarbonidx) for ligcarbonidx in ligand_ring] # Create the name with numbers separated by underscores name_with_numbers = "_".join(numbers_as_strings) - col_name = f"{prot_partner}_{name_with_numbers}_{interaction}" + if self.schema != "residue": + col_name = f"ligand_{name_with_numbers}_{interaction}" + else: + col_name = f"{prot_partner}_{name_with_numbers}_{interaction}" ring_found = True break if not ring_found: @@ -102,19 +107,19 @@ def _gather_interactions(self, df): col_name = f"{prot_partner}_{ligcarbonidx}_{interaction}" elif row["INTERACTION"] == "hbond": if row["PROTISDON"]: - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" ligcarbonidx = int(row["ACCEPTORIDX"]) interaction = row["INTERACTION"] type = "Acceptor" elif not row["PROTISDON"]: - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" ligcarbonidx = int(row["DONORIDX"]) interaction = row["INTERACTION"] type = "Donor" # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}" elif row["INTERACTION"] == "halogen": - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" ligcarbonidx = int(row["DON_IDX"]) halogen = row["DONORTYPE"] interaction = row["INTERACTION"] @@ -122,27 +127,27 @@ def _gather_interactions(self, df): col_name = f"{prot_partner}_{ligcarbonidx}_{halogen}_{interaction}" elif row["INTERACTION"] == "waterbridge": if row["PROTISDON"]: - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" ligcarbonidx = int(row["ACCEPTOR_IDX"]) interaction = row["INTERACTION"] type = "Acceptor" # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}" elif not row["PROTISDON"]: - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" ligcarbonidx = int(row["DONOR_IDX"]) interaction = row["INTERACTION"] type = "Donor" # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligcarbonidx}_{type}_{interaction}" elif row["INTERACTION"] == "pistacking": - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" ligcarbonidx = row["LIG_IDX_LIST"] interaction = row["INTERACTION"] # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{ligcarbonidx}_{interaction}" elif row["INTERACTION"] == "pication": - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" ligidx = row["LIG_IDX_LIST"] ligtype = row["LIG_GROUP"] interaction = row["INTERACTION"] @@ -150,7 +155,7 @@ def _gather_interactions(self, df): col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}" col_name = col_name.replace(",", "_") elif row["INTERACTION"] == "saltbridge": - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" ligidx = row["LIG_IDX_LIST"] lig_group = row["LIG_GROUP"] interaction = row["INTERACTION"] @@ -181,26 +186,26 @@ def _gather_interactions(self, df): # Check if the 'INTERACTION' is 'hydrophobic' if row["INTERACTION"] == "hydrophobic": # Get the values from the current row - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] interaction = row["INTERACTION"] ring_found = False col_name = f"{prot_partner}_{peptide_partner}_{interaction}" elif row["INTERACTION"] == "hbond": if row["PROTISDON"]: - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] interaction = row["INTERACTION"] type = "Acceptor" elif not row["PROTISDON"]: - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] interaction = row["INTERACTION"] type = "Donor" # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{peptide_partner}_{type}_{interaction}" elif row["INTERACTION"] == "halogen": - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] halogen = row["DONORTYPE"] interaction = row["INTERACTION"] @@ -208,27 +213,27 @@ def _gather_interactions(self, df): col_name = f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}" elif row["INTERACTION"] == "waterbridge": if row["PROTISDON"]: - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] interaction = row["INTERACTION"] type = "Acceptor" # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{peptide_partner}_{type}_{interaction}" elif not row["PROTISDON"]: - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] interaction = row["INTERACTION"] type = "Donor" # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{peptide_partner}_{type}_{interaction}" elif row["INTERACTION"] == "pistacking": - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] interaction = row["INTERACTION"] # Concatenate the values to form a unique column name col_name = f"{prot_partner}_{peptide_partner}_{interaction}" elif row["INTERACTION"] == "pication": - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" ligidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] ligtype = row["RESTYPE_LIG"] interaction = row["INTERACTION"] @@ -236,7 +241,7 @@ def _gather_interactions(self, df): col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}" col_name = col_name.replace(",", "_") elif row["INTERACTION"] == "saltbridge": - prot_partner = row["Prot_partner"] + prot_partner = row["Prot_partner"] if self.schema == "residue" else "ligand" ligidx = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] lig_group = row["RESTYPE_LIG"] interaction = row["INTERACTION"] @@ -379,246 +384,383 @@ def _df_iteration_numbering(self, df, unique_data): None Modifies the input DataFrame in-place by appending columns corresponding to recurring interactions. """ - if self.peptide is None: - for index, row in df.iterrows(): - if row["INTERACTION"] == "hydrophobic": - for col in unique_data.values(): - if "hydrophobic" in col: - ligcarbonidx_check = str(int(row["LIGCARBONIDX"])) - if ligcarbonidx_check in col: - parts = col.split("_") - prot_partner = parts[0] - interaction = parts[-1] - condition = (row["Prot_partner"] == prot_partner) & (row["INTERACTION"] == interaction) - df.at[index, col] = 1 if condition else 0 - else: - continue - elif row["INTERACTION"] == "hbond": - if row["PROTISDON"]: + if self.schema == "residue": + if self.peptide is None: + for index, row in df.iterrows(): + if row["INTERACTION"] == "hydrophobic": + for col in unique_data.values(): + if "hydrophobic" in col: + ligcarbonidx_check = str(int(row["LIGCARBONIDX"])) + if ligcarbonidx_check in col: + parts = col.split("_") + prot_partner = parts[0] + interaction = parts[-1] + condition = (row["Prot_partner"] == prot_partner) & (row["INTERACTION"] == interaction) + df.at[index, col] = 1 if condition else 0 + else: + continue + elif row["INTERACTION"] == "hbond": + if row["PROTISDON"]: + for col in unique_data.values(): + if "hbond" in col: + prot_partner, ligcarbonidx, type, interaction = col.split("_") + ligcarbonidx = int(ligcarbonidx) + condition = ( + (row["Prot_partner"] == prot_partner) + & (int(row["ACCEPTORIDX"]) == ligcarbonidx) + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 + elif not row["PROTISDON"]: + for col in unique_data.values(): + if "hbond" in col: + prot_partner, ligcarbonidx, type, interaction = col.split("_") + ligcarbonidx = int(ligcarbonidx) + condition = ( + (row["Prot_partner"] == prot_partner) + & (int(row["DONORIDX"]) == ligcarbonidx) + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 + elif row["INTERACTION"] == "halogen": for col in unique_data.values(): - if "hbond" in col: - prot_partner, ligcarbonidx, type, interaction = col.split("_") + if "halogen" in col: + prot_partner, ligcarbonidx, halogen, interaction = col.split("_") ligcarbonidx = int(ligcarbonidx) condition = ( (row["Prot_partner"] == prot_partner) - & (int(row["ACCEPTORIDX"]) == ligcarbonidx) + & (int(row["DON_IDX"]) == ligcarbonidx) + & (row["DONORTYPE"] == halogen) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - elif not row["PROTISDON"]: + elif row["INTERACTION"] == "pistacking": for col in unique_data.values(): - if "hbond" in col: - prot_partner, ligcarbonidx, type, interaction = col.split("_") - ligcarbonidx = int(ligcarbonidx) + if "pistacking" in col: + prot_partner, ligcarbonidx, interaction = col.split("_") condition = ( (row["Prot_partner"] == prot_partner) - & (int(row["DONORIDX"]) == ligcarbonidx) + & (row["LIG_IDX_LIST"] == ligcarbonidx) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "halogen": - for col in unique_data.values(): - if "halogen" in col: - prot_partner, ligcarbonidx, halogen, interaction = col.split("_") - ligcarbonidx = int(ligcarbonidx) - condition = ( - (row["Prot_partner"] == prot_partner) - & (int(row["DON_IDX"]) == ligcarbonidx) - & (row["DONORTYPE"] == halogen) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "pistacking": - for col in unique_data.values(): - if "pistacking" in col: - prot_partner, ligcarbonidx, interaction = col.split("_") - condition = ( - (row["Prot_partner"] == prot_partner) - & (row["LIG_IDX_LIST"] == ligcarbonidx) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "waterbridge": - for col in unique_data.values(): - if "waterbridge" in col: - if row["PROTISDON"]: - prot_partner, ligcarbonidx, type, interaction = col.split("_") + elif row["INTERACTION"] == "waterbridge": + for col in unique_data.values(): + if "waterbridge" in col: + if row["PROTISDON"]: + prot_partner, ligcarbonidx, type, interaction = col.split("_") + condition = ( + (row["Prot_partner"] == prot_partner) + & (int(row["ACCEPTOR_IDX"]) == int(ligcarbonidx)) + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 + elif not row["PROTISDON"]: + prot_partner, ligcarbonidx, type, interaction = col.split("_") + condition = ( + (row["Prot_partner"] == prot_partner) + & (int(row["DONOR_IDX"]) == int(ligcarbonidx)) + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 + elif row["INTERACTION"] == "pication": + for col in unique_data.values(): + if "pication" in col: + parts = col.split("_") + prot_partner = parts[0] + ligidx = parts[1:-2] + ligidx = ",".join(ligidx) + ligtype = parts[-2] + interaction = parts[-1] condition = ( (row["Prot_partner"] == prot_partner) - & (int(row["ACCEPTOR_IDX"]) == int(ligcarbonidx)) + & (row["LIG_IDX_LIST"] == ligidx) + & (row["LIG_GROUP"] == ligtype) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - elif not row["PROTISDON"]: - prot_partner, ligcarbonidx, type, interaction = col.split("_") + elif row["INTERACTION"] == "saltbridge": + for col in unique_data.values(): + if "saltbridge" in col: + parts = col.split("_") + prot_partner = parts[0] + ligidx = parts[1:-3] + ligidx = ",".join(ligidx) + lig_group = parts[-3] + interaction = parts[-1] condition = ( (row["Prot_partner"] == prot_partner) - & (int(row["DONOR_IDX"]) == int(ligcarbonidx)) + & (row["LIG_IDX_LIST"] == ligidx) + & (row["LIG_GROUP"] == lig_group) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "pication": - for col in unique_data.values(): - if "pication" in col: - parts = col.split("_") - prot_partner = parts[0] - ligidx = parts[1:-2] - ligidx = ",".join(ligidx) - ligtype = parts[-2] - interaction = parts[-1] - condition = ( - (row["Prot_partner"] == prot_partner) - & (row["LIG_IDX_LIST"] == ligidx) - & (row["LIG_GROUP"] == ligtype) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "saltbridge": - for col in unique_data.values(): - if "saltbridge" in col: - parts = col.split("_") - prot_partner = parts[0] - ligidx = parts[1:-3] - ligidx = ",".join(ligidx) - lig_group = parts[-3] - interaction = parts[-1] - condition = ( - (row["Prot_partner"] == prot_partner) - & (row["LIG_IDX_LIST"] == ligidx) - & (row["LIG_GROUP"] == lig_group) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "metal": - for col in unique_data.values(): - if "metal" in col: - parts = col.split("_") - special_ligand = parts[0] - ligidx = int(parts[1]) - metal_type = parts[2] - interaction = parts[-1] - condition = ( - (row["RESTYPE_LIG"] == special_ligand) - & (int(row["TARGET_IDX"]) == ligidx) - & (row["METAL_TYPE"] == metal_type) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 - if self.peptide is not None: - for index, row in df.iterrows(): - if row["INTERACTION"] == "hydrophobic": - for col in unique_data.values(): - if "hydrophobic" in col: - ligcarbonidx_check = str(int(row["RESNR_LIG"])) - if ligcarbonidx_check in col: + elif row["INTERACTION"] == "metal": + for col in unique_data.values(): + if "metal" in col: parts = col.split("_") - prot_partner = parts[0] + special_ligand = parts[0] + ligidx = int(parts[1]) + metal_type = parts[2] interaction = parts[-1] - condition = (row["Prot_partner"] == prot_partner) & (row["INTERACTION"] == interaction) + condition = ( + (row["RESTYPE_LIG"] == special_ligand) + & (int(row["TARGET_IDX"]) == ligidx) + & (row["METAL_TYPE"] == metal_type) + & (row["INTERACTION"] == interaction) + ) df.at[index, col] = 1 if condition else 0 - else: - continue - elif row["INTERACTION"] == "hbond": - if row["PROTISDON"]: + if self.peptide is not None: + for index, row in df.iterrows(): + if row["INTERACTION"] == "hydrophobic": for col in unique_data.values(): - if "hbond" in col: - prot_partner, ligcarbonidx, type, interaction = col.split("_") + if "hydrophobic" in col: + ligcarbonidx_check = str(int(row["RESNR_LIG"])) + if ligcarbonidx_check in col: + parts = col.split("_") + prot_partner = parts[0] + interaction = parts[-1] + condition = (row["Prot_partner"] == prot_partner) & (row["INTERACTION"] == interaction) + df.at[index, col] = 1 if condition else 0 + else: + continue + elif row["INTERACTION"] == "hbond": + if row["PROTISDON"]: + for col in unique_data.values(): + if "hbond" in col: + prot_partner, ligcarbonidx, type, interaction = col.split("_") + condition = ( + (row["Prot_partner"] == prot_partner) + & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligcarbonidx) + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 + elif not row["PROTISDON"]: + for col in unique_data.values(): + if "hbond" in col: + prot_partner, ligcarbonidx, type, interaction = col.split("_") + condition = ( + (row["Prot_partner"] == prot_partner) + & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligcarbonidx) + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 + elif row["INTERACTION"] == "halogen": + for col in unique_data.values(): + if "halogen" in col: + prot_partner, ligcarbonidx, halogen, interaction = col.split("_") condition = ( (row["Prot_partner"] == prot_partner) & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligcarbonidx) + & (row["DONORTYPE"] == halogen) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - elif not row["PROTISDON"]: + elif row["INTERACTION"] == "pistacking": for col in unique_data.values(): - if "hbond" in col: - prot_partner, ligcarbonidx, type, interaction = col.split("_") + if "pistacking" in col: + prot_partner, ligcarbonidx, interaction = col.split("_") condition = ( (row["Prot_partner"] == prot_partner) & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligcarbonidx) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "halogen": - for col in unique_data.values(): - if "halogen" in col: - prot_partner, ligcarbonidx, halogen, interaction = col.split("_") - condition = ( - (row["Prot_partner"] == prot_partner) - & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligcarbonidx) - & (row["DONORTYPE"] == halogen) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "pistacking": - for col in unique_data.values(): - if "pistacking" in col: - prot_partner, ligcarbonidx, interaction = col.split("_") - condition = ( - (row["Prot_partner"] == prot_partner) - & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligcarbonidx) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "waterbridge": - for col in unique_data.values(): - if "waterbridge" in col: - if row["PROTISDON"]: - prot_partner, ligcarbonidx, type, interaction = col.split("_") + elif row["INTERACTION"] == "waterbridge": + for col in unique_data.values(): + if "waterbridge" in col: + if row["PROTISDON"]: + prot_partner, ligcarbonidx, type, interaction = col.split("_") + condition = ( + (row["Prot_partner"] == prot_partner) + & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligcarbonidx) + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 + elif not row["PROTISDON"]: + prot_partner, ligcarbonidx, type, interaction = col.split("_") + condition = ( + (row["Prot_partner"] == prot_partner) + & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligcarbonidx) + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 + elif row["INTERACTION"] == "pication": + for col in unique_data.values(): + if "pication" in col: + parts = col.split("_") + prot_partner = parts[0] + ligidx = parts[1] + ligtype = parts[-2] + interaction = parts[-1] condition = ( (row["Prot_partner"] == prot_partner) - & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligcarbonidx) + & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligidx) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - elif not row["PROTISDON"]: - prot_partner, ligcarbonidx, type, interaction = col.split("_") + elif row["INTERACTION"] == "saltbridge": + for col in unique_data.values(): + if "saltbridge" in col: + parts = col.split("_") + prot_partner = parts[0] + ligidx = parts[1] + lig_group = parts[-3] + interaction = parts[-1] condition = ( (row["Prot_partner"] == prot_partner) - & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligcarbonidx) + & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligidx) & (row["INTERACTION"] == interaction) ) df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "pication": - for col in unique_data.values(): - if "pication" in col: - parts = col.split("_") - prot_partner = parts[0] - ligidx = parts[1] - ligtype = parts[-2] - interaction = parts[-1] - condition = ( - (row["Prot_partner"] == prot_partner) - & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligidx) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "saltbridge": - for col in unique_data.values(): - if "saltbridge" in col: - parts = col.split("_") - prot_partner = parts[0] - ligidx = parts[1] - lig_group = parts[-3] - interaction = parts[-1] - condition = ( - (row["Prot_partner"] == prot_partner) - & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligidx) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 - elif row["INTERACTION"] == "metal": - for col in unique_data.values(): - if "metal" in col: - parts = col.split("_") - special_ligand = parts[0] - ligidx = parts[1] - metal_type = parts[2] - interaction = parts[-1] - condition = ( - (row["RESTYPE_LIG"] == special_ligand) - & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligidx) - & (row["METAL_TYPE"] == metal_type) - & (row["INTERACTION"] == interaction) - ) - df.at[index, col] = 1 if condition else 0 + elif row["INTERACTION"] == "metal": + for col in unique_data.values(): + if "metal" in col: + parts = col.split("_") + special_ligand = parts[0] + ligidx = parts[1] + metal_type = parts[2] + interaction = parts[-1] + condition = ( + (row["RESTYPE_LIG"] == special_ligand) + & ((str(row["RESNR_LIG"]) + row["RESTYPE_LIG"]) == ligidx) + & (row["METAL_TYPE"] == metal_type) + & (row["INTERACTION"] == interaction) + ) + df.at[index, col] = 1 if condition else 0 + return + + # schema != "residue": residue-agnostic numbering + for col in unique_data.values(): + if col not in df.columns: + df[col] = 0 + + for idx, row in df.iterrows(): + if row["INTERACTION"] == "skip": + continue + + col_name = self._row_to_colname(row, prot_partner="ligand") + + if col_name in unique_data: + df.at[idx, col_name] = 1 + + def _row_to_colname(self, row, prot_partner: str) -> str: + """ + Build the interaction column name for a single row, using the provided prot_partner token. + Must mirror _gather_interactions naming exactly. + """ + interaction = row["INTERACTION"] + if interaction == "skip": + return "skip" + + # ----------------------------- + # Small-molecule ligand case + # ----------------------------- + if self.peptide is None: + if interaction == "hydrophobic": + ligcarbonidx = int(row["LIGCARBONIDX"]) + ring_found = False + + # ligcarbonidx might be str/int depending on upstream; keep behavior consistent + for ligand_ring in (self.ligand_rings or []): + if ligcarbonidx in ligand_ring: + name_with_numbers = "_".join(str(x) for x in ligand_ring) + ring_found = True + if self.schema != "residue": + return f"ligand_{name_with_numbers}_{interaction}" + return f"{prot_partner}_{name_with_numbers}_{interaction}" + + if not ring_found: + return f"{prot_partner}_{int(row['LIGCARBONIDX'])}_{interaction}" + + elif interaction == "hbond": + if row["PROTISDON"]: + ligcarbonidx = int(row["ACCEPTORIDX"]) + type_ = "Acceptor" + else: + ligcarbonidx = int(row["DONORIDX"]) + type_ = "Donor" + return f"{prot_partner}_{ligcarbonidx}_{type_}_{interaction}" + + elif interaction == "halogen": + ligcarbonidx = int(row["DON_IDX"]) + halogen = row["DONORTYPE"] + return f"{prot_partner}_{ligcarbonidx}_{halogen}_{interaction}" + + elif interaction == "waterbridge": + if row["PROTISDON"]: + ligcarbonidx = int(row["ACCEPTOR_IDX"]) + type_ = "Acceptor" + else: + ligcarbonidx = int(row["DONOR_IDX"]) + type_ = "Donor" + return f"{prot_partner}_{ligcarbonidx}_{type_}_{interaction}" + + elif interaction == "pistacking": + ligidx = row["LIG_IDX_LIST"] # keep commas (downstream highlighter expects them) + return f"{prot_partner}_{ligidx}_{interaction}" + + elif interaction == "pication": + ligidx = str(row["LIG_IDX_LIST"]) + ligtype = row["LIG_GROUP"] + col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}" + return col_name.replace(",", "_") # MUST match _gather_interactions + + elif interaction == "saltbridge": + ligidx = row["LIG_IDX_LIST"] # keep commas + lig_group = row["LIG_GROUP"] + type_ = "NI" if row["PROTISPOS"] else "PI" + return f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}" + + elif interaction == "metal": + special_ligand = row["RESTYPE_LIG"] + ligcarbonidx = int(row["TARGET_IDX"]) + metal_type = row["METAL_TYPE"] + coordination = row["COORDINATION"] + return f"{special_ligand}_{ligcarbonidx}_{metal_type}_{coordination}_{interaction}" + + return "skip" + + # ----------------------------- + # Peptide ligand case + # ----------------------------- + peptide_partner = str(row["RESNR_LIG"]) + row["RESTYPE_LIG"] + + if interaction == "hydrophobic": + return f"{prot_partner}_{peptide_partner}_{interaction}" + + elif interaction == "hbond": + type_ = "Acceptor" if row["PROTISDON"] else "Donor" + return f"{prot_partner}_{peptide_partner}_{type_}_{interaction}" + + elif interaction == "halogen": + halogen = row["DONORTYPE"] + return f"{prot_partner}_{peptide_partner}_{halogen}_{interaction}" + + elif interaction == "waterbridge": + type_ = "Acceptor" if row["PROTISDON"] else "Donor" + return f"{prot_partner}_{peptide_partner}_{type_}_{interaction}" + + elif interaction == "pistacking": + return f"{prot_partner}_{peptide_partner}_{interaction}" + + elif interaction == "pication": + ligidx = peptide_partner + ligtype = row["RESTYPE_LIG"] + col_name = f"{prot_partner}_{ligidx}_{ligtype}_{interaction}" + return col_name.replace(",", "_") + + elif interaction == "saltbridge": + ligidx = peptide_partner + lig_group = row["RESTYPE_LIG"] + type_ = "NI" if row["PROTISPOS"] else "PI" + return f"{prot_partner}_{ligidx}_{lig_group}_{type_}_{interaction}" + + elif interaction == "metal": + special_ligand = row["RESTYPE_LIG"] + ligcarbonidx = peptide_partner + metal_type = row["METAL_TYPE"] + coordination = row["COORDINATION"] + return f"{special_ligand}_{ligcarbonidx}_{metal_type}_{coordination}_{interaction}" + + return "skip" \ No newline at end of file diff --git a/openmmdl/openmmdl_analysis/analysis/rmsd.py b/openmmdl/openmmdl_analysis/analysis/rmsd.py index fc70aa61..b92e1468 100644 --- a/openmmdl/openmmdl_analysis/analysis/rmsd.py +++ b/openmmdl/openmmdl_analysis/analysis/rmsd.py @@ -121,7 +121,7 @@ def rmsd_dist_frames(self, fig_type, lig, nucleic=False): return pairwise_rmsd_prot, pairwise_rmsd_lig - def calculate_distance_matrix(self, selection): + def calculate_distance_matrix(self, selection, n_frames=None): """ Calculates the pairwise RMSD-based distance matrix for all trajectory frames for the selected atom selection. @@ -137,16 +137,20 @@ def calculate_distance_matrix(self, selection): np.ndarray Numpy array containing RMSD values between all pairs of frames. """ + if n_frames is None: + n_frames = len(self.universe.trajectory) + n_frames = min(int(n_frames), len(self.universe.trajectory)) + distances = np.zeros((len(self.universe.trajectory), len(self.universe.trajectory))) # calculate distance matrix for i in tqdm( - range(len(self.universe.trajectory)), + range(n_frames), desc="\033[1mCalculating distance matrix:\033[0m", ): self.universe.trajectory[i] frame_i = self.universe.select_atoms(selection).positions # distances[i] = md.rmsd(traj_aligned, traj_aligned, frame=i) - for j in range(i + 1, len(self.universe.trajectory)): + for j in range(i + 1, n_frames): self.universe.trajectory[j] frame_j = self.universe.select_atoms(selection).positions rmsd = self._calc_rmsd_2frames(frame_i, frame_j) diff --git a/openmmdl/openmmdl_analysis/openmmdlanalysis.py b/openmmdl/openmmdl_analysis/openmmdlanalysis.py index cfccc474..8c78097e 100644 --- a/openmmdl/openmmdl_analysis/openmmdlanalysis.py +++ b/openmmdl/openmmdl_analysis/openmmdlanalysis.py @@ -1,6 +1,7 @@ import argparse import json import os +import re import pickle import sys import warnings @@ -45,6 +46,37 @@ warnings.filterwarnings("ignore") +from contextlib import contextmanager + +@contextmanager +def pushd(path: str): + old = os.getcwd() + os.makedirs(path, exist_ok=True) + os.chdir(path) + try: + yield + finally: + os.chdir(old) + +_FLOAT_RE = re.compile(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?") + + +def parse_xyz(val): + """ + Robustly parse (x,y,z) from strings like: + "(1.0, 2.0, 3.0)" + "(np.float64(1.0), np.float64(2.0), np.float64(3.0))" + "[1.0, 2.0, 3.0]" + Returns [x,y,z] or None. + """ + if val is None or val == 0 or val == "0": + return None + s = str(val) + nums = _FLOAT_RE.findall(s) + if len(nums) < 3: + return None + return [float(nums[0]), float(nums[1]), float(nums[2])] + def main(): logo = "\n".join( @@ -179,6 +211,9 @@ def main(): ".xtc", "trr", ] + + run_root = os.getcwd() + args = parser.parse_args() if input_formats[0] not in args.topology and input_formats[4] not in args.topology: print("Topology is missing, try the absolute path") @@ -188,27 +223,26 @@ def main(): and input_formats[6] not in args.trajectory ): print("Trajectory is missing, try the absolute path") + topology = os.path.abspath(args.topology) + trajectory = os.path.abspath(args.trajectory) + if args.ligand_name is None: print("Ligand name is missing. Add the name of your ligand from your topology file") sys.exit(1) # set variables for analysis and preprocess input files - topology = args.topology - trajectory = args.trajectory # enable gromacs support and write topology and trajectory files if ".tpr" in args.topology and (".xtc" in args.trajectory or ".trr" in args.trajectory): print("\033[1mGromacs format detected. Writing compatible file formats.\033[0m") - u = mda.Universe(args.topology, args.trajectory) - with mda.Writer("trajectory.dcd", n_atoms=u.atoms.n_atoms) as W: + u = mda.Universe(topology, trajectory) + with mda.Writer(trajectory, n_atoms=u.atoms.n_atoms) as W: first_frame_saved = False for ts in u.trajectory: if not first_frame_saved: - with mda.Writer("topology.pdb", n_atoms=u.atoms.n_atoms) as pdb_writer: + with mda.Writer(topology, n_atoms=u.atoms.n_atoms) as pdb_writer: pdb_writer.write(u.atoms) first_frame_saved = True W.write(u.atoms) - pdb_md = mda.Universe("topology.pdb", "trajectory.dcd") - topology = "topology.pdb" - trajectory = "trajectory.dcd" + pdb_md = mda.Universe(topology, trajectory) water_eps = float(args.water_eps) stable_water_analysis = bool(args.stable_water_analysis) if stable_water_analysis: @@ -360,362 +394,371 @@ def main(): interaction_analysis = InteractionAnalyzer(pdb_md, dataframe, cpu_count, ligand, special_ligand, peptide, md_len) interaction_list = interaction_analysis.interaction_list interaction_list.to_csv("missing_frames_filled.csv") - interaction_list = interaction_list.reset_index(drop=True) + interaction_list_raw = interaction_list.reset_index(drop=True) # add amount of frames for Markov chains and binding modes total_frames = md_len - 1 - bmode_processor = BindingModeProcesser( - pdb_md, - ligand, - peptide, - special_ligand, - ligand_rings, - interaction_list, - threshold, - total_frames, - ) - interaction_list = bmode_processor.interaction_list - interactions_all = bmode_processor.interactions_all - unique_data = bmode_processor.unique_data - interactions_all.to_csv("df_all.csv") # Saving the dataframe - - # Group by 'FRAME' and transform each group to set all values to 1 if there is at least one 1 in each column - grouped_frames_treshold = interaction_list.groupby("FRAME", as_index=False)[list(unique_data.values())].max() - grouped_frames_treshold = grouped_frames_treshold.set_index("FRAME", drop=False) - update_values(interaction_list, grouped_frames_treshold, unique_data, "FRAME") - - # Change the FRAME column value type to int - grouped_frames_treshold["FRAME"] = grouped_frames_treshold["FRAME"].astype(int) - - # Extract all columns except 'FRAME' and the index column - selected_columns = grouped_frames_treshold.columns[1:-1] - - # Create a list of lists with the values from selected columns for each row - treshold_result_list = [row[selected_columns].values.tolist() for _, row in grouped_frames_treshold.iterrows()] - - # Calculate the occurrences of each list in the result_list - # Create a new column 'fingerprint' in the DataFrame - grouped_frames_treshold["fingerprint"] = None - - # Set the 'fingerprint' column values based on the corresponding index in result_list - for index, fingerprint_value in enumerate(treshold_result_list, 1): - grouped_frames_treshold.at[index, "fingerprint"] = fingerprint_value - - # Assuming your original DataFrame is named 'df' - # First, we'll create a new column 'Binding_fingerprint_hbond' - grouped_frames_treshold["Binding_fingerprint_treshold"] = "" - - # Dictionary to keep track of encountered fingerprints and their corresponding labels - treshold_fingerprint_dict = {} - - # Counter to generate the labels (Hbond_Binding_1, Hbond_Binding_2, etc.) - label_counter = 1 - - # Iterate through the rows and process the 'fingerprint' column - for index, row in grouped_frames_treshold.iterrows(): - fingerprint = tuple(row["fingerprint"]) - - # Check if the fingerprint has been encountered before - if fingerprint in treshold_fingerprint_dict: - grouped_frames_treshold.at[index, "Binding_fingerprint_treshold"] = treshold_fingerprint_dict[fingerprint] - else: - # Assign a new label if the fingerprint is new - label = f"Binding_Mode_{label_counter}" - treshold_fingerprint_dict[fingerprint] = label - grouped_frames_treshold.at[index, "Binding_fingerprint_treshold"] = label - label_counter += 1 - - # Group the DataFrame by the 'Binding_fingerprint_hbond' column and create the dictionary of the fingerprints - fingerprint_dict = grouped_frames_treshold["Binding_fingerprint_treshold"].to_dict() - combined_dict = {"all": []} - for key, value in fingerprint_dict.items(): - combined_dict["all"].append(value) - - # Generate Markov state figures of the binding modes - markov_analysis = MarkovChainAnalysis(min_transition) - markov_analysis.generate_transition_graph(total_frames, combined_dict, fig_type) - print("\033[1mMarkov State Figure generated\033[0m") - - # Get the top 10 nodes with the most occurrences - node_occurrences = {node: combined_dict["all"].count(node) for node in set(combined_dict["all"])} - top_10_nodes = sorted(node_occurrences, key=node_occurrences.get, reverse=True)[:10] - top_10_nodes_with_occurrences = {node: node_occurrences[node] for node in top_10_nodes} - - # Initialize an empty dictionary to store the result - columns_with_value_1 = {} - - for treshold, row_count in top_10_nodes_with_occurrences.items(): - for i in range(1, row_count + 1): - # Get the row corresponding to the treshold value - row = grouped_frames_treshold.loc[grouped_frames_treshold["Binding_fingerprint_treshold"] == treshold].iloc[ - i - 1 - ] - - # Extract column names with value 1 in that row - columns_with_1 = row[row == 1].index.tolist() - - # Convert the list to a set to remove duplicates - columns_set = set(columns_with_1) - - # Add the columns to the dictionary under the corresponding threshold - if treshold not in columns_with_value_1: - columns_with_value_1[treshold] = set() - columns_with_value_1[treshold].update(columns_set) - - # Generate an Figure for each of the binding modes with rdkit Drawer with the atoms interacting highlighted by colors - try: - if peptide is None: - interaction_processor = FigureHighlighter("complex.pdb", "lig_no_h.pdb") - matplotlib.use("Agg") - binding_site = {} - merged_image_paths = [] - for binding_mode, values in columns_with_value_1.items(): - binding_site[binding_mode] = values - occurrence_count = top_10_nodes_with_occurrences[binding_mode] - occurrence_percent = 100 * occurrence_count / total_frames - # Convert ligand to RDKit with Converter - lig_atoms = complex_lig.convert_to("RDKIT") - # Remove Hydrogens and get 2D representation - prepared_ligand = Chem.RemoveAllHs(lig_atoms) - # Generate 2D coordinates for the molecule - AllChem.Compute2DCoords(prepared_ligand) - split_data = interaction_processor.split_interaction_data(values) - # Get the highlighted atom indices based on interaction type - ( - highlighted_hbond_donor, - highlighted_hbond_acceptor, - highlighted_hbond_both, - highlighted_hydrophobic, - highlighted_waterbridge, - highlighted_pistacking, - highlighted_halogen, - highlighted_ni, - highlighted_pi, - highlighted_pication, - highlighted_metal, - ) = interaction_processor.highlight_numbers(split_data, starting_idx=lig_index) - - # Generate a dictionary for hydrogen bond acceptors - hbond_acceptor_dict = interaction_processor.generate_interaction_dict( - "hbond_acceptor", highlighted_hbond_acceptor - ) - # Generate a dictionary for hydrogen bond acceptors and donors - hbond_both_dict = interaction_processor.generate_interaction_dict("hbond_both", highlighted_hbond_both) - # Generate a dictionary for hydrogen bond donors - hbond_donor_dict = interaction_processor.generate_interaction_dict( - "hbond_donor", highlighted_hbond_donor - ) - # Generate a dictionary for hydrophobic features - hydrophobic_dict = interaction_processor.generate_interaction_dict( - "hydrophobic", highlighted_hydrophobic - ) - # Generate a dictionary for water bridge interactions - waterbridge_dict = interaction_processor.generate_interaction_dict( - "waterbridge", highlighted_waterbridge - ) - # Generate a dictionary for pistacking - pistacking_dict = interaction_processor.generate_interaction_dict("pistacking", highlighted_pistacking) - # Generate a dictionary for halogen interactions - halogen_dict = interaction_processor.generate_interaction_dict("halogen", highlighted_halogen) - # Generate a dictionary for negative ionizables - ni_dict = interaction_processor.generate_interaction_dict("ni", highlighted_ni) - # Generate a dictionary for negative ionizables - pi_dict = interaction_processor.generate_interaction_dict("pi", highlighted_pi) - # Generate a dictionary for pication - pication_dict = interaction_processor.generate_interaction_dict("pication", highlighted_pication) - # Generate a dictionary for metal interactions - metal_dict = interaction_processor.generate_interaction_dict("metal", highlighted_metal) - - # Call the function to update hbond_donor_dict with values from other dictionaries - update_dict( - hbond_donor_dict, - hbond_acceptor_dict, - ni_dict, - pi_dict, - hydrophobic_dict, - hbond_both_dict, - waterbridge_dict, - pistacking_dict, - halogen_dict, - pication_dict, - metal_dict, - ) - - # Convert the highlight_atoms to int type for rdkit drawer - highlight_atoms = [ - int(x) - for x in highlighted_hbond_donor - + highlighted_hbond_acceptor - + highlighted_hbond_both - + highlighted_ni - + highlighted_pi - + highlighted_hydrophobic - + highlighted_waterbridge - + highlighted_pistacking - + highlighted_halogen - + highlighted_pication - + highlighted_metal - ] - highlight_atoms = list(set(highlight_atoms)) - - # Convert the RDKit molecule to SVG format with atom highlights - drawer = rdMolDraw2D.MolDraw2DSVG(600, 600) - drawer.DrawMolecule( - prepared_ligand, - highlightAtoms=highlight_atoms, - highlightAtomColors=hbond_donor_dict, - ) - drawer.FinishDrawing() - svg = drawer.GetDrawingText().replace("svg:", "") - - # Save the SVG to a file - with open(f"{binding_mode}.svg", "w") as f: - f.write(svg) - - # Convert the svg to an png - cairosvg.svg2png(url=f"{binding_mode}.svg", write_to=f"{binding_mode}.png") - - # Generate the interactions legend and combine it with the ligand png - image_merger = FigureMerger(binding_mode, occurrence_percent, split_data, merged_image_paths) - merged_image_paths = image_merger.create_and_merge_images() - - # Create Figure with all Binding modes - figure_arranger = FigureArranger(merged_image_paths, "all_binding_modes_arranged.png") - figure_arranger.arranged_figure_generation() - - generator = LigandImageGenerator( + complex_pdb = os.path.join(run_root, "complex.pdb") + lig_no_h_pdb = os.path.join(run_root, "lig_no_h.pdb") + + + for schema in ("residue", "ligand"): + outdir = f"BindingModes_{schema}" + print(f"\033[1mRunning {schema} bindig mode analysis\033[0m") + + # IMPORTANT: fresh copy for each schema run + interaction_list = interaction_list_raw.copy(deep=True) + + with pushd(outdir): + # Running with Residues + bm_res = BindingModeProcesser( + pdb_md, ligand, - "complex.pdb", - "lig_no_h.pdb", - "ligand_numbering.svg", - fig_type, + peptide, + special_ligand, + ligand_rings, + interaction_list, + threshold, + total_frames, + schema, ) - generator.generate_image() - print("\033[1mBinding mode figure generated\033[0m") - except Exception: - print("Ligand could not be recognized, use the -l option") - - df_all = pd.read_csv("df_all.csv") - - pham_generator = PharmacophoreGenerator(df_all, ligand) - # get the top 10 bindingmodes with the most occurrences - binding_modes = grouped_frames_treshold["Binding_fingerprint_treshold"].str.split("\n") - all_binding_modes = [mode.strip() for sublist in binding_modes for mode in sublist] - binding_mode_counts = pd.Series(all_binding_modes).value_counts() - top_10_binding_modes = binding_mode_counts.head(10) - total_binding_modes = len(all_binding_modes) - result_dict = { - "Binding Mode": [], - "First Frame": [], - "All Frames": [], - "Representative Frame": [], - "Percentage Occurrence": [], - } - if generate_representative_frame: - DM = rmsd_analyzer.calculate_distance_matrix( - f"protein or nucleic or resname {ligand} or resname {special_ligand}" - ) - modes_to_process = top_10_binding_modes.index - for mode in tqdm(modes_to_process): - result_dict["Binding Mode"].append(mode) - first_frame = grouped_frames_treshold.loc[ - grouped_frames_treshold["Binding_fingerprint_treshold"].str.contains(mode), - "FRAME", - ].iloc[0] - all_frames = grouped_frames_treshold.loc[ - grouped_frames_treshold["Binding_fingerprint_treshold"].str.contains(mode), - "FRAME", - ].tolist() - percent_occurrence = (top_10_binding_modes[mode] / total_binding_modes) * 100 - result_dict["First Frame"].append(first_frame) - result_dict["All Frames"].append(all_frames) - result_dict["Percentage Occurrence"].append(percent_occurrence) - representative_frame = rmsd_analyzer.calculate_representative_frame(all_frames, DM) - result_dict["Representative Frame"].append(representative_frame) - top_10_binding_modes_df = pd.DataFrame(result_dict) - top_10_binding_modes_df.to_csv("top_10_binding_modes.csv") - print("\033[1mFound representative frame for each binding mode\033[0m") - # save bindingmode pdb and .pml - for index, row in top_10_binding_modes_df.iterrows(): - b_mode = row["Binding Mode"] - rep_frame = int(row["Representative Frame"]) - trajsaver.save_frame(rep_frame, f"./Binding_Modes_Markov_States/{b_mode}.pdb") + + interaction_list = bm_res.interaction_list + interactions_all = bm_res.interactions_all + unique_data = bm_res.unique_data + interactions_all.to_csv("df_all.csv") # Saving the dataframe + + # Group by 'FRAME' and transform each group to set all values to 1 if there is at least one 1 in each column + grouped_frames_treshold = interaction_list.groupby("FRAME", as_index=False)[list(unique_data.values())].max() + grouped_frames_treshold = grouped_frames_treshold.set_index("FRAME", drop=False) + update_values(interaction_list, grouped_frames_treshold, unique_data, "FRAME") + + # Change the FRAME column value type to int + grouped_frames_treshold["FRAME"] = grouped_frames_treshold["FRAME"].astype(int) + + # Extract all columns except 'FRAME' and the index column + selected_columns = grouped_frames_treshold.columns[1:-1] + + # Create a list of lists with the values from selected columns for each row + treshold_result_list = [row[selected_columns].values.tolist() for _, row in grouped_frames_treshold.iterrows()] + + # Calculate the occurrences of each list in the result_list + # Create a new column 'fingerprint' in the DataFrame + grouped_frames_treshold["fingerprint"] = None + + # Set the 'fingerprint' column values based on the corresponding index in result_list + for index, fingerprint_value in enumerate(treshold_result_list, 1): + grouped_frames_treshold.at[index, "fingerprint"] = fingerprint_value + + # Assuming your original DataFrame is named 'df' + # First, we'll create a new column 'Binding_fingerprint_hbond' + grouped_frames_treshold["Binding_fingerprint_treshold"] = "" + + # Dictionary to keep track of encountered fingerprints and their corresponding labels + treshold_fingerprint_dict = {} + + # Counter to generate the labels (Hbond_Binding_1, Hbond_Binding_2, etc.) + label_counter = 1 + + # Iterate through the rows and process the 'fingerprint' column + for index, row in grouped_frames_treshold.iterrows(): + fingerprint = tuple(row["fingerprint"]) + + # Check if the fingerprint has been encountered before + if fingerprint in treshold_fingerprint_dict: + grouped_frames_treshold.at[index, "Binding_fingerprint_treshold"] = treshold_fingerprint_dict[fingerprint] + else: + # Assign a new label if the fingerprint is new + label = f"Binding_Mode_{label_counter}" + treshold_fingerprint_dict[fingerprint] = label + grouped_frames_treshold.at[index, "Binding_fingerprint_treshold"] = label + label_counter += 1 + + # Group the DataFrame by the 'Binding_fingerprint_hbond' column and create the dictionary of the fingerprints + fingerprint_dict = grouped_frames_treshold["Binding_fingerprint_treshold"].to_dict() + combined_dict = {"all": []} + for key, value in fingerprint_dict.items(): + combined_dict["all"].append(value) + + # Generate Markov state figures of the binding modes + markov_analysis = MarkovChainAnalysis(min_transition) + markov_analysis.generate_transition_graph(total_frames, combined_dict, fig_type) + print(f"\033[1m{schema} bindig mode: markov state figure generated\033[0m") + + # Get the top 10 nodes with the most occurrences + node_occurrences = {node: combined_dict["all"].count(node) for node in set(combined_dict["all"])} + top_10_nodes = sorted(node_occurrences, key=node_occurrences.get, reverse=True)[:10] + top_10_nodes_with_occurrences = {node: node_occurrences[node] for node in top_10_nodes} + + # Initialize an empty dictionary to store the result + columns_with_value_1 = {} + + for treshold, row_count in top_10_nodes_with_occurrences.items(): + for i in range(1, row_count + 1): + # Get the row corresponding to the treshold value + row = grouped_frames_treshold.loc[grouped_frames_treshold["Binding_fingerprint_treshold"] == treshold].iloc[ + i - 1 + ] + + # Extract column names with value 1 in that row + columns_with_1 = row[row == 1].index.tolist() + + # Convert the list to a set to remove duplicates + columns_set = set(columns_with_1) + + # Add the columns to the dictionary under the corresponding threshold + if treshold not in columns_with_value_1: + columns_with_value_1[treshold] = set() + columns_with_value_1[treshold].update(columns_set) + + # Generate an Figure for each of the binding modes with rdkit Drawer with the atoms interacting highlighted by colors + try: + if peptide is None: + interaction_processor = FigureHighlighter(complex_pdb, lig_no_h_pdb) + matplotlib.use("Agg") + binding_site = {} + merged_image_paths = [] + for binding_mode, values in columns_with_value_1.items(): + binding_site[binding_mode] = values + occurrence_count = top_10_nodes_with_occurrences[binding_mode] + occurrence_percent = 100 * occurrence_count / total_frames + # Convert ligand to RDKit with Converter + lig_atoms = complex_lig.convert_to("RDKIT") + # Remove Hydrogens and get 2D representation + prepared_ligand = Chem.RemoveAllHs(lig_atoms) + # Generate 2D coordinates for the molecule + AllChem.Compute2DCoords(prepared_ligand) + split_data = interaction_processor.split_interaction_data(values) + # Get the highlighted atom indices based on interaction type + ( + highlighted_hbond_donor, + highlighted_hbond_acceptor, + highlighted_hbond_both, + highlighted_hydrophobic, + highlighted_waterbridge, + highlighted_pistacking, + highlighted_halogen, + highlighted_ni, + highlighted_pi, + highlighted_pication, + highlighted_metal, + ) = interaction_processor.highlight_numbers(split_data, starting_idx=lig_index) + + # Generate a dictionary for hydrogen bond acceptors + hbond_acceptor_dict = interaction_processor.generate_interaction_dict( + "hbond_acceptor", highlighted_hbond_acceptor + ) + # Generate a dictionary for hydrogen bond acceptors and donors + hbond_both_dict = interaction_processor.generate_interaction_dict("hbond_both", highlighted_hbond_both) + # Generate a dictionary for hydrogen bond donors + hbond_donor_dict = interaction_processor.generate_interaction_dict( + "hbond_donor", highlighted_hbond_donor + ) + # Generate a dictionary for hydrophobic features + hydrophobic_dict = interaction_processor.generate_interaction_dict( + "hydrophobic", highlighted_hydrophobic + ) + # Generate a dictionary for water bridge interactions + waterbridge_dict = interaction_processor.generate_interaction_dict( + "waterbridge", highlighted_waterbridge + ) + # Generate a dictionary for pistacking + pistacking_dict = interaction_processor.generate_interaction_dict("pistacking", highlighted_pistacking) + # Generate a dictionary for halogen interactions + halogen_dict = interaction_processor.generate_interaction_dict("halogen", highlighted_halogen) + # Generate a dictionary for negative ionizables + ni_dict = interaction_processor.generate_interaction_dict("ni", highlighted_ni) + # Generate a dictionary for negative ionizables + pi_dict = interaction_processor.generate_interaction_dict("pi", highlighted_pi) + # Generate a dictionary for pication + pication_dict = interaction_processor.generate_interaction_dict("pication", highlighted_pication) + # Generate a dictionary for metal interactions + metal_dict = interaction_processor.generate_interaction_dict("metal", highlighted_metal) + + # Call the function to update hbond_donor_dict with values from other dictionaries + update_dict( + hbond_donor_dict, + hbond_acceptor_dict, + ni_dict, + pi_dict, + hydrophobic_dict, + hbond_both_dict, + waterbridge_dict, + pistacking_dict, + halogen_dict, + pication_dict, + metal_dict, + ) + + # Convert the highlight_atoms to int type for rdkit drawer + highlight_atoms = [ + int(x) + for x in highlighted_hbond_donor + + highlighted_hbond_acceptor + + highlighted_hbond_both + + highlighted_ni + + highlighted_pi + + highlighted_hydrophobic + + highlighted_waterbridge + + highlighted_pistacking + + highlighted_halogen + + highlighted_pication + + highlighted_metal + ] + highlight_atoms = list(set(highlight_atoms)) + + # Convert the RDKit molecule to SVG format with atom highlights + drawer = rdMolDraw2D.MolDraw2DSVG(600, 600) + drawer.DrawMolecule( + prepared_ligand, + highlightAtoms=highlight_atoms, + highlightAtomColors=hbond_donor_dict, + ) + drawer.FinishDrawing() + svg = drawer.GetDrawingText().replace("svg:", "") + + # Save the SVG to a file + with open(f"{binding_mode}.svg", "w") as f: + f.write(svg) + + # Convert the svg to an png + cairosvg.svg2png(url=f"{binding_mode}.svg", write_to=f"{binding_mode}.png") + + # Generate the interactions legend and combine it with the ligand png + image_merger = FigureMerger(binding_mode, occurrence_percent, split_data, merged_image_paths) + merged_image_paths = image_merger.create_and_merge_images() + + # Create Figure with all Binding modes + figure_arranger = FigureArranger(merged_image_paths, "all_binding_modes_arranged.png") + figure_arranger.arranged_figure_generation() + + generator = LigandImageGenerator( + ligand, + complex_pdb, + lig_no_h_pdb, + "ligand_numbering.svg", + fig_type, + ) + generator.generate_image() + print(f"\033[1m{schema} bindig mode: Binding mode figure generated\033[0m") + except Exception: + print("Ligand could not be recognized, use the -l option") + + df_all = pd.read_csv("df_all.csv") + + pham_generator = PharmacophoreGenerator(df_all, ligand) + # get the top 10 bindingmodes with the most occurrences + binding_modes = grouped_frames_treshold["Binding_fingerprint_treshold"].str.split("\n") + all_binding_modes = [mode.strip() for sublist in binding_modes for mode in sublist] + binding_mode_counts = pd.Series(all_binding_modes).value_counts() + top_10_binding_modes = binding_mode_counts.head(10) + total_binding_modes = len(all_binding_modes) + result_dict = { + "Binding Mode": [], + "First Frame": [], + "All Frames": [], + "Representative Frame": [], + "Percentage Occurrence": [], + } + if generate_representative_frame: + DM = rmsd_analyzer.calculate_distance_matrix( + f"protein or nucleic or resname {ligand} or resname {special_ligand}", + n_frames=md_len - 1 + ) + modes_to_process = top_10_binding_modes.index + for mode in tqdm(modes_to_process): + result_dict["Binding Mode"].append(mode) + first_frame = grouped_frames_treshold.loc[ + grouped_frames_treshold["Binding_fingerprint_treshold"].str.contains(mode), + "FRAME", + ].iloc[0] + all_frames = grouped_frames_treshold.loc[ + grouped_frames_treshold["Binding_fingerprint_treshold"].str.contains(mode), + "FRAME", + ].tolist() + percent_occurrence = (top_10_binding_modes[mode] / total_binding_modes) * 100 + result_dict["First Frame"].append(first_frame) + result_dict["All Frames"].append(all_frames) + result_dict["Percentage Occurrence"].append(percent_occurrence) + representative_frame = rmsd_analyzer.calculate_representative_frame(all_frames, DM) + result_dict["Representative Frame"].append(representative_frame) + top_10_binding_modes_df = pd.DataFrame(result_dict) + top_10_binding_modes_df.to_csv("top_10_binding_modes.csv") + print(f"\033[1m{schema} bindig mode: found representative frame for each binding mode\033[0m") + # save bindingmode pdb and .pml + for index, row in top_10_binding_modes_df.iterrows(): + b_mode = row["Binding Mode"] + rep_frame = int(row["Representative Frame"]) + trajsaver.save_frame(rep_frame, f"./Binding_Modes_Markov_States/{b_mode}.pdb") + if generate_pml: + filtered_df_all = df_all[df_all["FRAME"] == rep_frame] + filtered_df_bindingmodes = grouped_frames_treshold[grouped_frames_treshold["FRAME"] == rep_frame] + bindingmode_dict = {} + for index, row in filtered_df_bindingmodes.iterrows(): + for column in filtered_df_bindingmodes.columns: + if column not in [ + "FRAME", + "FRAME.1", + "fingerprint", + "Binding_fingerprint_treshold", + ]: + if row[column] == 1: + if column not in bindingmode_dict: + bindingmode_dict[column] = { + "LIGCOO": [], + "PROTCOO": [], + } # Initialize a nested dictionary for each key if not already present + for index2, row2 in filtered_df_all.iterrows(): + if row2[column] == 1: + lig_xyz = parse_xyz(row2["LIGCOO"]) + prot_xyz = parse_xyz(row2["PROTCOO"]) + + if lig_xyz is None or prot_xyz is None: + continue + + bindingmode_dict[column]["LIGCOO"].append(lig_xyz) + bindingmode_dict[column]["PROTCOO"].append(prot_xyz) + + pham_generator.generate_bindingmode_pharmacophore(bindingmode_dict, b_mode) + + print(f"\033[1m {schema} bindig mode: Binding mode pdb files saved\033[0m") + + waterbridge_barcodes = {} + barcode_gen = BarcodeGenerator(df_all) + interaction_types = barcode_gen.interactions + for waterbridge_interaction in interaction_types["waterbridge"]: + barcode = barcode_gen.generate_barcode(waterbridge_interaction) + waterbridge_barcodes[waterbridge_interaction] = barcode + + # Initialize BarcodePlotter + barcode_plotter = BarcodePlotter(df_all) + + for interaction_type, interaction_data in interaction_types.items(): + barcode_plotter.plot_barcodes_grouped(interaction_data, interaction_type, fig_type) + + barcode_plotter.plot_waterbridge_piechart(waterbridge_barcodes, interaction_types["waterbridge"], fig_type) + print(f"\033[1m{schema} bindig mode: Barcodes generated\033[0m") + + interacting_water_id_list = barcode_gen.interacting_water_ids(interaction_types["waterbridge"]) + + # dump interacting waters for visualization + os.makedirs("Visualization", exist_ok=True) # Create the folder if it doesn't exist + with open("Visualization/interacting_waters.pkl", "wb") as f: + pickle.dump(interacting_water_id_list, f) + trajsaver.save_interacting_waters_trajectory(interacting_water_id_list, "./Visualization/") + + # save clouds for visualization with NGL + with open("Visualization/clouds.json", "w") as f: + json.dump(pham_generator.clouds, f) + + # generate poincloud pml for visualization if generate_pml: - filtered_df_all = df_all[df_all["FRAME"] == rep_frame] - filtered_df_bindingmodes = grouped_frames_treshold[grouped_frames_treshold["FRAME"] == rep_frame] - bindingmode_dict = {} - for index, row in filtered_df_bindingmodes.iterrows(): - for column in filtered_df_bindingmodes.columns: - if column not in [ - "FRAME", - "FRAME.1", - "fingerprint", - "Binding_fingerprint_treshold", - ]: - if row[column] == 1: - if column not in bindingmode_dict: - bindingmode_dict[column] = { - "LIGCOO": [], - "PROTCOO": [], - } # Initialize a nested dictionary for each key if not already present - for index2, row2 in filtered_df_all.iterrows(): - if row2[column] == 1: - # Extract the string representation of the tuple - tuple_string = row2["LIGCOO"] - # Split the string into individual values using a comma as the delimiter - ligcoo_values = tuple_string.strip("()").split(",") - # Convert the string values to float - ligcoo_values = [float(value.strip()) for value in ligcoo_values] - - # Extract the string representation of the tuple for PROTCOO - tuple_string = row2["PROTCOO"] - # Split the string into individual values using a comma as the delimiter - protcoo_values = tuple_string.strip("()").split(",") - # Convert the string values to float - protcoo_values = [float(value.strip()) for value in protcoo_values] - - bindingmode_dict[column]["LIGCOO"].append(ligcoo_values) - bindingmode_dict[column]["PROTCOO"].append(protcoo_values) - pham_generator.generate_bindingmode_pharmacophore(bindingmode_dict, b_mode) - - print("\033[1mBinding mode pdb files saved\033[0m") - - waterbridge_barcodes = {} - barcode_gen = BarcodeGenerator(df_all) - interaction_types = barcode_gen.interactions - for waterbridge_interaction in interaction_types["waterbridge"]: - barcode = barcode_gen.generate_barcode(waterbridge_interaction) - waterbridge_barcodes[waterbridge_interaction] = barcode - - # Initialize BarcodePlotter - barcode_plotter = BarcodePlotter(df_all) - - for interaction_type, interaction_data in interaction_types.items(): - barcode_plotter.plot_barcodes_grouped(interaction_data, interaction_type, fig_type) - - barcode_plotter.plot_waterbridge_piechart(waterbridge_barcodes, interaction_types["waterbridge"], fig_type) - print("\033[1mBarcodes generated\033[0m") - - interacting_water_id_list = barcode_gen.interacting_water_ids(interaction_types["waterbridge"]) - - # dump interacting waters for visualization - os.makedirs("Visualization", exist_ok=True) # Create the folder if it doesn't exist - with open("Visualization/interacting_waters.pkl", "wb") as f: - pickle.dump(interacting_water_id_list, f) - trajsaver.save_interacting_waters_trajectory(interacting_water_id_list, "./Visualization/") - - # save clouds for visualization with NGL - with open("Visualization/clouds.json", "w") as f: - json.dump(pham_generator.clouds, f) - - # generate poincloud pml for visualization - if generate_pml: - pham_generator.generate_point_cloud_pml("point_cloud") - # generate combo pharmacophore of the md with each interaction as a single pharmacophore feature - pham_generator.generate_md_pharmacophore_cloudcenters("combopharm") - print("\033[1mPharmacophores generated\033[0m") - + pham_generator.generate_point_cloud_pml("point_cloud") + # generate combo pharmacophore of the md with each interaction as a single pharmacophore feature + pham_generator.generate_md_pharmacophore_cloudcenters("combopharm") + print(f"\033[1m {schema} bindig mode: Pharmacophores generated\033[0m") + print("\033[1mAnalysis is Finished.\033[0m") if stable_water_analysis: diff --git a/openmmdl/openmmdl_analysis/visualization/pharmacophore.py b/openmmdl/openmmdl_analysis/visualization/pharmacophore.py index da63e588..4938858f 100644 --- a/openmmdl/openmmdl_analysis/visualization/pharmacophore.py +++ b/openmmdl/openmmdl_analysis/visualization/pharmacophore.py @@ -3,6 +3,8 @@ import xml.etree.ElementTree as ET +COORD_RE = re.compile(r"\(\s*([-\d.]+)\s*,\s*([-\d.]+)\s*,\s*([-\d.]+)\s*\)") + class PharmacophoreGenerator: """ A class for generating pharmacophore features from molecular dynamics (MD) interaction data. @@ -535,90 +537,86 @@ def _format_clouds(self, interaction_coords): } def _generate_pharmacophore_centers(self, interactions): - """ - Generates pharmacophore points for interactions that are points such as hydrophobic and ionic interactions. - - Parameters - ---------- - interactions : list of str - List of interactions to generate pharmacophore from. - - Returns - ------- - dict - Dict of interactions from which pharmacophore is generated as key and list of coordinates as value. - """ - coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)") pharmacophore = {} + for interaction in interactions: + # Only rows where this interaction occurs + if interaction not in self.df_all.columns: + continue + df_hits = self.df_all[self.df_all[interaction] == 1] + if df_hits.empty: + continue + counter = 0 - sum_x, sum_y, sum_z = 0, 0, 0 - for index, row in self.df_all.iterrows(): - if row[interaction] == 1: - coord_match = coord_pattern.match(row["LIGCOO"]) - if coord_match: - x, y, z = map(float, coord_match.groups()) - sum_x += x - sum_y += y - sum_z += z - counter += 1 - - center_x = round((sum_x / counter), 3) - center_y = round((sum_y / counter), 3) - center_z = round((sum_z / counter), 3) - pharmacophore[interaction] = [center_x, center_y, center_z] - + sum_x = sum_y = sum_z = 0.0 + + for _, row in df_hits.iterrows(): + m = COORD_RE.search(str(row.get("LIGCOO", ""))) + if not m: + continue + x, y, z = map(float, m.groups()) + sum_x += x + sum_y += y + sum_z += z + counter += 1 + + # Guard against division by zero (no parseable coords) + if counter == 0: + continue + + pharmacophore[interaction] = [ + round(sum_x / counter, 3), + round(sum_y / counter, 3), + round(sum_z / counter, 3), + ] + return pharmacophore - def _generate_pharmacophore_vectors(self, interactions): - """ - Generates pharmacophore points for interactions that are vectors such as hydrogen bond donors or acceptors. - - Parameters - ---------- - interactions : list of str - List of interactions to generate pharmacophore vectors from. - Returns - ------- - dict - Dict of interactions from which pharmacophore is generated as key and list of - coordinates as value (first coords are ligand side, second are protein side). - """ - coord_pattern = re.compile(r"\(([\d.-]+), ([\d.-]+), ([\d.-]+)\)") + def _generate_pharmacophore_vectors(self, interactions): pharmacophore = {} + for interaction in interactions: + if interaction not in self.df_all.columns: + continue + df_hits = self.df_all[self.df_all[interaction] == 1] + if df_hits.empty: + continue + counter = 0 - sum_x, sum_y, sum_z = 0, 0, 0 - sum_a, sum_b, sum_c = 0, 0, 0 - for index, row in self.df_all.iterrows(): - if row[interaction] == 1: - coord_match = coord_pattern.match(row["LIGCOO"]) - if coord_match: - x, y, z = map(float, coord_match.groups()) - sum_x += x - sum_y += y - sum_z += z - coord_match = coord_pattern.match(row["PROTCOO"]) - if coord_match: - a, b, c = map(float, coord_match.groups()) - sum_a += a - sum_b += b - sum_c += c - counter += 1 - - center_x = round((sum_x / counter), 3) - center_y = round((sum_y / counter), 3) - center_z = round((sum_z / counter), 3) - center_a = round((sum_a / counter), 3) - center_b = round((sum_b / counter), 3) - center_c = round((sum_c / counter), 3) + sum_x = sum_y = sum_z = 0.0 + sum_a = sum_b = sum_c = 0.0 + + for _, row in df_hits.iterrows(): + lig_m = COORD_RE.search(str(row.get("LIGCOO", ""))) + prot_m = COORD_RE.search(str(row.get("PROTCOO", ""))) + + # Only count frames where BOTH ends are available + if not lig_m or not prot_m: + continue + + x, y, z = map(float, lig_m.groups()) + a, b, c = map(float, prot_m.groups()) + + sum_x += x + sum_y += y + sum_z += z + sum_a += a + sum_b += b + sum_c += c + counter += 1 + + if counter == 0: + continue + pharmacophore[interaction] = [ - [center_x, center_y, center_z], - [center_a, center_b, center_c], + [round(sum_x / counter, 3), round(sum_y / counter, 3), round(sum_z / counter, 3)], + [round(sum_a / counter, 3), round(sum_b / counter, 3), round(sum_c / counter, 3)], ] + return pharmacophore + def _generate_pharmacophore_centers_all_points(self, interactions): """ Generates pharmacophore points for all interactions to generate point cloud diff --git a/openmmdl/tests/openmmdl_analysis/analysis/test_bindingmodes.py b/openmmdl/tests/openmmdl_analysis/analysis/test_bindingmodes.py new file mode 100644 index 00000000..13d3c78f --- /dev/null +++ b/openmmdl/tests/openmmdl_analysis/analysis/test_bindingmodes.py @@ -0,0 +1,107 @@ +import pandas as pd +import pytest + +from openmmdl.openmmdl_analysis.analysis.bindingmodes import BindingModeProcesser + + +@pytest.fixture +def sample_interaction_list(): + """Minimal interaction_list DataFrame for BindingModeProcesser.""" + return pd.DataFrame( + [ + # Two hbonds that share the same ligand atom index (ACCEPTORIDX=5) + { + "FRAME": 1, + "INTERACTION": "hbond", + "Prot_partner": "12GLYA", + "PROTISDON": True, + "ACCEPTORIDX": 5, + "DONORIDX": 0, + }, + { + "FRAME": 2, + "INTERACTION": "hbond", + "Prot_partner": "13SERA", + "PROTISDON": True, + "ACCEPTORIDX": 5, + "DONORIDX": 0, + }, + # One hydrophobic contact on ligand carbon 7 + { + "FRAME": 3, + "INTERACTION": "hydrophobic", + "Prot_partner": "14LEUA", + "LIGCARBONIDX": 7, + }, + ] + ) + + +def test_schema_ligand_aggregates_by_ligand_atom(sample_interaction_list): + """schema='ligand' covers all residues together.""" + bm_lig = BindingModeProcesser( + pdb_md=None, + ligand="LIG", + peptide=None, + special=None, + ligand_rings=None, + interaction_list=sample_interaction_list.copy(deep=True), + threshold=0, # include all interactions for this test + total_frames=10, + schema="ligand", + ) + + # Two hbonds share ACCEPTORIDX=5 => ONE ligand-centric column + assert "ligand_5_Acceptor_hbond" in bm_lig.unique_data + hbond_rows = bm_lig.interaction_list[bm_lig.interaction_list["INTERACTION"] == "hbond"] + assert (hbond_rows["ligand_5_Acceptor_hbond"] == 1).all() + + # Hydrophobic should also be ligand-centric + assert "ligand_7_hydrophobic" in bm_lig.unique_data + hydro_rows = bm_lig.interaction_list[bm_lig.interaction_list["INTERACTION"] == "hydrophobic"] + assert (hydro_rows["ligand_7_hydrophobic"] == 1).all() + + +def test_schema_residue_keeps_residue_specificity(sample_interaction_list): + """schema='residue' covers each residues individually.""" + bm_res = BindingModeProcesser( + pdb_md=None, + ligand="LIG", + peptide=None, + special=None, + ligand_rings=None, + interaction_list=sample_interaction_list.copy(deep=True), + threshold=0, + total_frames=10, + schema="residue", + ) + + # Residue schema produces two distinct columns for the two different residues + assert "12GLYA_5_Acceptor_hbond" in bm_res.unique_data + assert "13SERA_5_Acceptor_hbond" in bm_res.unique_data + + # Must NOT collapse to ligand_* naming + assert "ligand_5_Acceptor_hbond" not in bm_res.unique_data + + +def test_interactions_all_includes_rare_contacts_for_ligand_schema(sample_interaction_list): + """interactions_all/unique_data_all must include all observed interactions regardless of threshold.""" + bm_lig_high_thresh = BindingModeProcesser( + pdb_md=None, + ligand="LIG", + peptide=None, + special=None, + ligand_rings=None, + interaction_list=sample_interaction_list.copy(deep=True), + threshold=50, # 50% of total_frames=10 => min count 5 (none of these pass) + total_frames=10, + schema="ligand", + ) + + # High threshold filters these out of the "main" set + assert "ligand_5_Acceptor_hbond" not in bm_lig_high_thresh.unique_data + assert "ligand_7_hydrophobic" not in bm_lig_high_thresh.unique_data + + # But "all interactions" must still contain them + assert "ligand_5_Acceptor_hbond" in bm_lig_high_thresh.unique_data_all + assert "ligand_7_hydrophobic" in bm_lig_high_thresh.unique_data_all