Skip to content

Commit ff24c49

Browse files
author
jl4mc
committed
CASP16 system
1 parent 9907a18 commit ff24c49

18 files changed: +1830 / -429 lines changed

gate/feature/align_models_by_sequence.py

+24-12
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def get_sequence_by_chain(chain_id, sequence_id_map):
124124
return sequence
125125
return ''
126126

127-
def merge_chain_pdbs(chain_mapping, outfile):
127+
def merge_chain_pdbs(chain_mapping, outfile, check_format=False):
128128
# reorder chains based on the stoichiometry
129129
# e.g., A2B2: AB CD
130130
chain_idx = 0
@@ -143,25 +143,36 @@ def merge_chain_pdbs(chain_mapping, outfile):
143143
#print(contents)
144144
fw.write('\n'.join(contents))
145145

146+
if check_format:
147+
parser = PDBParser(QUIET=True)
148+
try:
149+
structure2 = parser.get_structure('', outfile)
150+
except Exception as e:
151+
os.system("rm " + outfile)
152+
146153

147154
def filter_single_model(inparams):
148155

149-
clustalw_program, sequence_id_map, inpdb, pdbdir, outpdb = inparams
156+
clustalw_program, sequence_id_map, inpdb, pdbdir, outpdb, check_format = inparams
150157

151158
# print(f"Filtering {inpdb}")
152159

153160
# get chain mapping from pdb to fasta file
154-
chain_mapping = get_chain_mapping(clustalw_program=clustalw_program,
155-
sequence_id_map=sequence_id_map,
156-
inpdb=inpdb,
157-
pdbdir=pdbdir)
158-
# print(chain_mapping)
159-
merge_chain_pdbs(chain_mapping, outpdb)
161+
try:
162+
chain_mapping = get_chain_mapping(clustalw_program=clustalw_program,
163+
sequence_id_map=sequence_id_map,
164+
inpdb=inpdb,
165+
pdbdir=pdbdir)
166+
# print(chain_mapping)
167+
merge_chain_pdbs(chain_mapping, outpdb, check_format)
168+
except Exception as e:
169+
print(f"Filtering {inpdb} failed!")
170+
print(e)
160171

161172
os.system(f"rm -rf {pdbdir}")
162173

163174

164-
def align_models(clustalw_program, fasta_path, outdir, input_model_dir):
175+
def align_models(clustalw_program, fasta_path, outdir, input_model_dir, check_format):
165176

166177
# read sequences from fasta file
167178
sequences, descriptions = parse_fasta(open(fasta_path).read())
@@ -193,7 +204,7 @@ def align_models(clustalw_program, fasta_path, outdir, input_model_dir):
193204

194205
makedir_if_not_exists(workdir)
195206

196-
process_list.append([clustalw_program, sequence_id_map, input_model_dir + '/' + model, workdir, outdir + '/' + model.replace('.pdb', '')])
207+
process_list.append([clustalw_program, sequence_id_map, input_model_dir + '/' + model, workdir, outdir + '/' + model.replace('.pdb', ''), check_format])
197208

198209
pool = Pool(processes=40)
199210
results = pool.map(filter_single_model, process_list)
@@ -208,8 +219,9 @@ def align_models(clustalw_program, fasta_path, outdir, input_model_dir):
208219
parser.add_argument('--outdir', type=str, required=True)
209220
parser.add_argument('--modeldir', type=str, required=True)
210221
parser.add_argument('--clustalw_program', type=str, required=True)
211-
222+
parser.add_argument('--check_format', default=False, type=lambda x: (str(x).lower() == 'true'))
223+
212224
args = parser.parse_args()
213225

214-
align_models(args.clustalw_program, args.fasta_path, args.outdir, args.modeldir)
226+
align_models(args.clustalw_program, args.fasta_path, args.outdir, args.modeldir, args.check_format)
215227

gate/feature/config.py

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
'interface_pairwise_ts_script': os.path.join(ROOTDIR, 'gate', 'feature', 'interface_pairwise_ts.py'),
1818

1919
'icps_script': os.path.join(ROOTDIR, 'gate', 'feature', 'generate_icps_scores.py'),
20+
'model_size_script': os.path.join(ROOTDIR, 'gate', 'feature', 'generate_model_size.py'),
2021
'plddt_script': os.path.join(ROOTDIR, 'gate', 'feature', 'generate_plddt_scores.py'),
2122
'enqa_script': os.path.join(ROOTDIR, 'gate', 'feature', 'generate_enqa_scores.py'),
2223
'dproqa_script': os.path.join(ROOTDIR, 'gate', 'feature', 'generate_dproqa_scores.py'),

gate/feature/feature_generation.py

+467-40
Large diffs are not rendered by default.

gate/feature/generate_enqa_scores.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def generate_enqa_scores(enqa_env_path:str,
3939
model_size_ratio = dict(zip(list(model_info_df['model']), list(model_info_df['model_size_norm'])))
4040
target_dict['score_norm'] = []
4141

42-
max_length_threshold = 2500
42+
max_length_threshold = 2600
4343
# read sequences from fasta file
4444
sequences, descriptions = parse_fasta(open(fasta_path).read())
4545
target_length = np.sum(np.array([len(sequence) for sequence in sequences]))

gate/feature/generate_gcpnet_ema_scores.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def generate_gcpnet_scores(gcpnet_ema_env_path: str,
3434
f"data.ablate_esm_embeddings=false " \
3535
f"model.ablate_af2_plddt=false " \
3636
f"data.predict_output_dir={outdir}/workdir"
37-
37+
print(cmd)
3838
resultfile = f"{outdir}/result.csv"
3939

4040
if not os.path.exists(resultfile):
@@ -52,7 +52,7 @@ def generate_gcpnet_scores(gcpnet_ema_env_path: str,
5252
model_size_ratio = {}
5353
if model_csv is not None and os.path.exists(model_csv):
5454
model_info_df = pd.read_csv(model_csv)
55-
model_size_ratio = dict(zip(list(model_info_df['model']), list(model_info_df['model_size_norm'])))
55+
model_size_ratio = dict(zip([str(modelname) for modelname in list(model_info_df['model'])], list(model_info_df['model_size_norm'])))
5656
data_dict['score_norm'] = []
5757

5858
pred_model_out_dir = os.path.join(outdir, 'pred_pdbs')
@@ -66,7 +66,7 @@ def generate_gcpnet_scores(gcpnet_ema_env_path: str,
6666
data_dict['score'] += [global_score / 100]
6767

6868
if 'score_norm' in data_dict:
69-
data_dict['score_norm'] += [global_score / 100 * float(model_size_ratio[modelname])]
69+
data_dict['score_norm'] += [global_score / 100 * float(model_size_ratio[str(modelname)])]
7070

7171
os.system(f"cp {pred_model} {pred_model_out_dir}/{modelname}")
7272

gate/feature/generate_icps_scores.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,10 @@ def generate_icps_scores(fasta_path, outdir, pairwise_score_csv, input_model_dir
353353
pairwise_df = pd.read_csv(pairwise_score_csv, index_col=[0])
354354
models = pairwise_df.columns
355355
tmscores = np.array([np.mean(np.array(pairwise_df[model])) for model in models])
356+
# pairwise_df = pd.read_csv(pairwise_score_csv)
357+
# models = pairwise_df['model']
358+
# tmscores = pairwise_df['MMalign score']
359+
356360
chain_pdbs = {}
357361
while True:
358362
select_model_idx = np.argmax(tmscores)
@@ -432,7 +436,7 @@ def generate_icps_scores(fasta_path, outdir, pairwise_score_csv, input_model_dir
432436

433437
cal_list = [[cdpred_cmap_file, cmap_file] for cmap_file in cmap_files]
434438
# print(cal_list)
435-
pool = Pool(processes=60)
439+
pool = Pool(processes=150)
436440
results = pool.map(icps_recall_wrappeer, cal_list)
437441
pool.close()
438442
pool.join()

0 commit comments

Comments
 (0)