@@ -25,9 +25,7 @@ def tokenize_variable_length(
2525 tokenizer : PreTrainedTokenizerBase ,
2626 add_special_tokens : bool = True ,
2727) -> BatchEncoding :
28- tokenized = tokenizer (
29- data ["text" ], add_special_tokens = add_special_tokens , truncation = False
30- )
28+ tokenized = tokenizer (data ["text" ], add_special_tokens = add_special_tokens , truncation = False )
3129 return tokenized
3230
3331
@@ -102,10 +100,7 @@ def pack_sequences(
102100 output = {"input_ids" : packed_sequences }
103101 if add_labels :
104102 output ["labels" ] = [
105- [
106- LOSS_IGNORE_INDEX if token_id == pad_token_id else token_id
107- for token_id in example
108- ]
103+ [LOSS_IGNORE_INDEX if token_id == pad_token_id else token_id for token_id in example ]
109104 for example in output ["input_ids" ]
110105 ]
111106
@@ -201,18 +196,14 @@ def process_data(args: argparse.Namespace) -> None:
201196
202197
203198if __name__ == "__main__" :
204- parser = argparse .ArgumentParser (
205- description = "Pretokenize examples for finetuning via Together"
206- )
199+ parser = argparse .ArgumentParser (description = "Pretokenize examples for finetuning via Together" )
207200 parser .add_argument (
208201 "--dataset" ,
209202 type = str ,
210203 default = "clam004/antihallucination_dataset" ,
211204 help = "Dataset name on the Hugging Face Hub" ,
212205 )
213- parser .add_argument (
214- "--max-seq-length" , type = int , default = 8192 , help = "Maximum sequence length"
215- )
206+ parser .add_argument ("--max-seq-length" , type = int , default = 8192 , help = "Maximum sequence length" )
216207 parser .add_argument (
217208 "--add-labels" ,
218209 action = "store_true" ,
0 commit comments