8
8
from scicode .parse .parse import read_from_jsonl
9
9
10
10
11
- prob_num = 80
12
- step_num = 338
11
+ prob_num = 65
12
+ step_num = 288
13
13
14
14
logs_dir = 'eval/logs'
15
- dev_lst = [1 , 3 , 4 , 6 , 7 , 10 , 19 , 29 , 38 , 44 , 47 , 49 , 51 , 70 , 78 ]
16
- step_num_dev = 50
15
+ code_dir = 'eval_results / generated_code'
16
+ test_result_dir = 'test_result'
17
+
17
18
18
19
if not os .path .exists (logs_dir ):
19
20
os .makedirs (logs_dir )
31
32
def test_code (model_name ):
32
33
start_time = time .time ()
33
34
34
- code_dir = f'eval_results/generated_code /{ model_name } '
35
+ code_dir_ = f'{ code_dir } /{ model_name } '
35
36
tmp_dir = f'tmp_{ start_time } '
36
37
37
38
if not os .path .exists (tmp_dir ):
38
39
os .makedirs (tmp_dir )
39
40
40
- for root , _ , files in os .walk (code_dir ):
41
+ for root , _ , files in os .walk (code_dir_ ):
41
42
for file in files :
42
43
file_name = Path (file ).stem
43
44
file_id = file_name .split ("." )[0 ]
@@ -120,24 +121,16 @@ def run_script(script_path):
120
121
correct_prob_num = sum (1 for i in range (prob_num ) if
121
122
correct_prob [i ] == tot_prob [i ]
122
123
and tot_prob [i ] != 0 )
123
- correct_prob_num_dev = sum (1 for i in range (prob_num ) if
124
- correct_prob [i ] == tot_prob [i ] and
125
- tot_prob [i ] != 0 and
126
- (i + 1 ) not in dev_lst )
127
124
128
- correct_step_dev = [i for i in correct_step if int (i .split ('.' )[0 ]) not in dev_lst ]
129
-
130
- print (f'correct problems(include dev set): { correct_prob_num } /{ prob_num } ' )
131
- print (f'correct steps(include dev set): { len (correct_step )} /{ step_num } ' )
125
+ print (f'correct problems: { correct_prob_num } /{ prob_num } ' )
126
+ print (f'correct steps: { len (correct_step )} /{ step_num } ' )
132
127
133
128
if not os .path .exists (test_result_dir ):
134
129
os .makedirs (test_result_dir )
135
130
136
131
with open (f'{ test_result_dir } /{ model_name } .txt' , 'w' ) as f :
137
132
f .write (f'correct problems(include dev set): { correct_prob_num } /{ prob_num } \n ' )
138
133
f .write (f'correct steps(include dev set): { len (correct_step )} /{ step_num } \n \n ' )
139
- f .write (f'correct problems(exclude dev set): { correct_prob_num_dev } /{ prob_num - len (dev_lst )} \n ' )
140
- f .write (f'correct steps(exclude dev set): { len (correct_step_dev )} /{ step_num - step_num_dev } \n \n ' )
141
134
f .write (f'duration: { test_time } seconds\n ' )
142
135
f .write ('\n correct problems: ' )
143
136
f .write (f'\n \n { [i + 1 for i in range (prob_num ) if correct_prob [i ] == tot_prob [i ] and tot_prob [i ] != 0 ]} \n ' )
@@ -148,25 +141,7 @@ def run_script(script_path):
148
141
Path (tmp_dir ).rmdir ()
149
142
150
143
151
- def delete_all_files_in_folder (folder_path ):
152
- try :
153
- for filename in os .listdir (folder_path ):
154
- file_path = os .path .join (folder_path , filename )
155
- if os .path .isfile (file_path ):
156
- os .unlink (file_path )
157
- elif os .path .isdir (file_path ):
158
- delete_all_files_in_folder (file_path )
159
- os .rmdir (file_path )
160
- print (f"All files in { folder_path } have been deleted." )
161
- except Exception as e :
162
- print (f"Error: { e } " )
163
144
164
145
165
- test_result_dir = 'test_result'
166
- models = ['gpt-4o' , 'claude-3-5-sonnet-20240620' , 'gpt-4-turbo-2024-04-09' ,
167
- 'claude-3-sonnet-20240229' , 'claude-3-opus-20240229' , 'gemini-1.5-pro' ]
168
- llm_code = True
169
- bgs = [0 , 1 , 3 ]
170
- for m in models :
171
- for bg in bgs :
172
- test_code (m )
146
+ model = 'gpt-4o'
147
+ test_code (model )
0 commit comments