-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun.sh
More file actions
executable file
·175 lines (158 loc) · 4.74 KB
/
run.sh
File metadata and controls
executable file
·175 lines (158 loc) · 4.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env bash
# main script for running all experiments in succession
#
# For each dataset: embed prompts, generate hypotheses with a target model,
# then score the generations with scripts.compute_loss.
# Fail fast: without this, a crashed generation step would silently be
# followed by an eval step that scores stale or missing outputs.
set -euo pipefail

# Number of GPUs to shard generation across (see falcon-7b note below for
# the one model that cannot be sharded this way).
readonly NUM_GPUS=4
# ---- full_chat tests ----
# embed: cache sentence-transformer embeddings of the prompts
echo "Embedding full_chat"
python -u -m scripts.generate_outputs \
  --datasets full_chat \
  --model-name-or-path sentence-transformers/multi-qa-mpnet-base-dot-v1 \
  --num-gpus "$NUM_GPUS" \
  --n-total 5000 \
  --batch-size 1000 \
  --seed 42 \
  --embed
# generate with flan-t5
echo "Generating full_chat with flan-t5-xxl"
python -u -m scripts.generate_outputs \
  --datasets full_chat \
  --model-name-or-path google/flan-t5-xxl \
  --num-gpus "$NUM_GPUS" \
  --print-container-logs \
  --n-total 2000 \
  --num-hypotheses 50 \
  --seed 42
# eval full_chat: score generations with an RLHF reward model
echo "Evaluating full_chat"
python -u -m scripts.compute_loss \
  --output-dir output \
  --datasets full_chat \
  --loss-fn weqweasdas/hh_rlhf_rm_open_llama_3b \
  --batch-size 5 \
  --eval-models google/flan-t5-xxl
# ---- red_team_chat tests ----
# embed: cache sentence-transformer embeddings of the prompts
echo "Embedding red_team_chat"
python -u -m scripts.generate_outputs \
  --datasets red_team_chat \
  --model-name-or-path sentence-transformers/multi-qa-mpnet-base-dot-v1 \
  --num-gpus "$NUM_GPUS" \
  --n-total 5000 \
  --batch-size 1000 \
  --seed 42 \
  --embed
# generate with flan-t5
echo "Generating red_team_chat with flan-t5-xxl"
python -u -m scripts.generate_outputs \
  --datasets red_team_chat \
  --model-name-or-path google/flan-t5-xxl \
  --num-gpus "$NUM_GPUS" \
  --print-container-logs \
  --n-total 2000 \
  --num-hypotheses 50 \
  --seed 42
# eval red_team_chat: score generations with an RLHF reward model
echo "Evaluating red_team_chat"
python -u -m scripts.compute_loss \
  --output-dir output \
  --datasets red_team_chat \
  --loss-fn weqweasdas/hh_rlhf_rm_open_llama_3b \
  --batch-size 5 \
  --eval-models google/flan-t5-xxl
# ---- cnn_dailymail tests ----
# embed: cache sentence-transformer embeddings of the articles
# (smaller batch than the chat datasets, presumably because articles are longer)
echo "Embedding cnn_dailymail"
python -u -m scripts.generate_outputs \
  --datasets cnn_dailymail \
  --model-name-or-path sentence-transformers/multi-qa-mpnet-base-dot-v1 \
  --num-gpus "$NUM_GPUS" \
  --n-total 5000 \
  --batch-size 200 \
  --seed 42 \
  --embed
# generate with llama 2
echo "Generating cnn_dailymail with meta-llama/Llama-2-7b-chat-hf"
python -u -m scripts.generate_outputs \
  --datasets cnn_dailymail \
  --model-name-or-path meta-llama/Llama-2-7b-chat-hf \
  --num-gpus "$NUM_GPUS" \
  --print-container-logs \
  --n-total 2000 \
  --num-hypotheses 40 \
  --seed 42
# eval cnn_dailymail: summarization, scored with ROUGE
echo "Evaluating cnn_dailymail"
python -u -m scripts.compute_loss \
  --output-dir output \
  --datasets cnn_dailymail \
  --loss-fn rouge
# ---- xsum tests ----
# embed: cache sentence-transformer embeddings of the articles
echo "Embedding xsum"
python -u -m scripts.generate_outputs \
  --datasets xsum \
  --model-name-or-path sentence-transformers/multi-qa-mpnet-base-dot-v1 \
  --num-gpus "$NUM_GPUS" \
  --n-total 5000 \
  --batch-size 200 \
  --seed 42 \
  --embed
# generate with llama 2
echo "Generating xsum with meta-llama/Llama-2-7b-chat-hf"
python -u -m scripts.generate_outputs \
  --datasets xsum \
  --model-name-or-path meta-llama/Llama-2-7b-chat-hf \
  --num-gpus "$NUM_GPUS" \
  --print-container-logs \
  --n-total 2000 \
  --num-hypotheses 40 \
  --seed 42
# eval xsum: summarization, scored with ROUGE
echo "Evaluating xsum"
python -u -m scripts.compute_loss \
  --output-dir output \
  --datasets xsum \
  --loss-fn rouge
# ---- meqsum tests ----
echo "Generating meqsum with tiiuae/falcon-40b-instruct"
python -u -m scripts.generate_outputs \
  --datasets bigbio/meqsum \
  --model-name-or-path tiiuae/falcon-40b-instruct \
  --num-gpus "$NUM_GPUS" \
  --print-container-logs \
  --n-total 2000 \
  --num-hypotheses 50 \
  --seed 42
# meqsum 7b (this only runs on a single GPU, run on a smaller server or do a better job of parallelizing)
echo "Generating meqsum with tiiuae/falcon-7b-instruct"
# have to use 1 gpu here: ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 71 and `num_shards`: 4) <-- who thought 71 was a good idea?
python -u -m scripts.generate_outputs \
  --datasets bigbio/meqsum \
  --model-name-or-path tiiuae/falcon-7b-instruct \
  --num-gpus 1 \
  --print-container-logs \
  --n-total 2000 \
  --num-hypotheses 50 \
  --seed 42
# eval meqsum: summarization, scored with ROUGE (covers both falcon runs)
echo "Evaluating meqsum"
python -u -m scripts.compute_loss \
  --output-dir output \
  --datasets bigbio/meqsum \
  --loss-fn rouge
# ---- mbpp tests ----
# code generation: sample multiple sequences per hypothesis (--do-sample)
echo "Generating mbpp with codellama/CodeLlama-7b-Instruct-hf"
python -u -m scripts.generate_outputs \
  --datasets mbpp \
  --model-name-or-path codellama/CodeLlama-7b-Instruct-hf \
  --num-gpus "$NUM_GPUS" \
  --print-container-logs \
  --n-total 1000 \
  --num-hypotheses 40 \
  --num-return-sequences 10 \
  --seed 42 \
  --do-sample
# eval mbpp: functional correctness via pass@k
echo "Evaluating mbpp"
python -u -m scripts.compute_loss \
  --output-dir output \
  --datasets mbpp \
  --loss-fn pass@k