tokenizer.py
import json

from transformers import AutoTokenizer
from datasets import Dataset

# Load the JSON data
with open("wiki_api_dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract the raw texts to build a HuggingFace-compatible dataset
texts = [article["text"] for article in data]

# Use the French GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "dbddv01/gpt2-french-small",
    padding_side="left",
    truncation_side="left"
)

# Configure the special tokens
special_tokens = {
    "pad_token": "<|pad|>",
    "bos_token": "<|bos|>",
    "eos_token": "<|eos|>",
}
tokenizer.add_special_tokens(special_tokens)
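# Note: because new special tokens were added, any model trained with this
# tokenizer needs its embedding matrix resized to match, e.g.
# model.resize_token_embeddings(len(tokenizer)) before training.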

# Tokenize the texts
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_special_tokens_mask=True,
        add_special_tokens=True
    )

# Create and process the dataset
dataset = Dataset.from_dict({"text": texts})
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing texts"
)

# Save the results
tokenized_datasets.save_to_disk("tokenized_dataset")
print("✅ Tokenized data saved!")

tokenizer.save_pretrained("trained_llm")
print("✅ Tokenizer saved!")