From 3727328752687df489310de2ad3d9a7a270a929f Mon Sep 17 00:00:00 2001 From: wiILIL <975202246@qq.com> Date: Sat, 1 Nov 2025 22:35:10 +0800 Subject: [PATCH] Tsds_selector --- docs/.vuepress/notes/en/guide.ts | 1 + docs/.vuepress/notes/zh/guide.ts | 1 + docs/en/notes/guide/selector/selector_tsds.md | 263 +++++++++++++++++ docs/zh/notes/guide/selector/selector_tsds.md | 264 ++++++++++++++++++ 4 files changed, 529 insertions(+) create mode 100644 docs/en/notes/guide/selector/selector_tsds.md create mode 100644 docs/zh/notes/guide/selector/selector_tsds.md diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts index c2f3455..6cbe8e1 100644 --- a/docs/.vuepress/notes/en/guide.ts +++ b/docs/.vuepress/notes/en/guide.ts @@ -25,6 +25,7 @@ export const Guide: ThemeNote = defineNoteConfig({ 'quickstart', 'tutorial', 'selector_less', + 'selector_tsds', ], }, { diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts index a393fce..b59d929 100644 --- a/docs/.vuepress/notes/zh/guide.ts +++ b/docs/.vuepress/notes/zh/guide.ts @@ -25,6 +25,7 @@ export const Guide: ThemeNote = defineNoteConfig({ 'quickstart', 'tutorial', 'selector_less', + 'selector_tsds', ], }, { diff --git a/docs/en/notes/guide/selector/selector_tsds.md b/docs/en/notes/guide/selector/selector_tsds.md new file mode 100644 index 0000000..807ff23 --- /dev/null +++ b/docs/en/notes/guide/selector/selector_tsds.md @@ -0,0 +1,263 @@ +--- +title: selector_tsds +createTime: 2025/11/01 21:36:21 +permalink: /en/guide/im5q9cd2/ +icon: tdesign:cat +--- + + +# TSDS Selector Guide + +This document explains how to use the **TSDS Selector** (Data Selection for Task‑Specific Model Finetuning) in the **DataFlex** framework to perform **dynamic training data selection** during supervised finetuning (SFT), balancing **representative density** and **topological diversity** to improve generalization. + +--- + +## 1. 
Method Overview + +The core idea of **TSDS** is: + +* Further encode **already tokenized** samples into **sentence embeddings** (e.g., 512‑dim). +* Perform **nearest‑neighbor search & kernel density estimation (KDE)** in the embedding space to obtain each sample’s representativeness score. +* Incorporate **topological diversity** (avoid only picking clusters), and trade off density vs. diversity via the coefficient `alpha`. + +> Intuition: **Higher density** ⇒ more “typical/representative” samples; **higher diversity** ⇒ broader coverage and less redundancy. + +### Scoring Formulation + +Let the sentence embedding of a sample be $e_i$, and let its $K$ nearest neighbors be $\mathcal{N}_K(i)$. + + +1. **Kernel Density Estimation (KDE):** + $$ + \text{density}(i) + = \frac{1}{K} \sum_{j\in \mathcal{N}_K(i)} + \exp\!\left(-\frac{\lVert e_i - e_j \rVert^2}{2\sigma^2}\right) + $$ + +2. **Diversity (simple implementation via de‑dup penalty / marginal gain):** + $$ + \text{diversity}(i)\ \propto + \min_{j\in S} \lVert e_i - e_j \rVert,\quad + S=\text{selected set} + $$ + +3. **Combined Score:** + $$ + \text{score}(i) + = \alpha\, \text{density}(i) + + (1-\alpha)\, \text{diversity}(i) + $$ + +> In practice, `kde_K` (neighbors used by KDE) and `max_K` (overall NN search limit) can differ. `C` can be used as a selection ratio/threshold or other control term depending on the implementation. + +--- + +## 2. Environment & Dependencies + +```bash +# DataFlex (recommended: editable install) +git clone https://github.com/OpenDCAI/DataFlex.git +cd DataFlex +pip install -e . + +# Common training/inference dependencies (as needed) +pip install llamafactory + +# TSDS extras (vector search & progress bars) +pip install faiss-cpu tqdm +``` + +--- + +## 3. 
Selector Registration & Initialization + +Register a custom TSDS selector component: + +```python +from dataflex.selectors import Selector, register_selector + +@register_selector("tsds") +class TsdsSelector(Selector): + """Topological & Statistical Density Selector""" + def __init__( + self, + dataset, + eval_dataset, + accelerator, + data_collator, + cache_dir, + seed: int = 42, + max_K: int = 128, + kde_K: int = 64, + sigma: float = 1.0, + alpha: float = 0.5, + C: float = 10.0, + sample_size: int = 1000, + model_name: str = "/home/lianghao/yry/TSDS/bert_chinese" # sentence encoder path + ): + super().__init__(dataset, accelerator, data_collator, cache_dir) + +``` +**TODO: Replace `model_name` with your local encoder path. Using the placeholder will raise an error.** + +> **Note:** `model_name` is used to encode **tokenized** samples into **sentence embeddings** (e.g., 512‑dim). Common choices include BERT/USE/SimCSE‑style encoders. + +--- + +## 4. Key Hyperparameters & Tips + +| Parameter | Typical Range | Meaning & Tips | +| ------------- | ------------- | --------------------------------------------------------------------------------------------- | +| `max_K` | 64–256 | Upper bound of NN retrieval. Larger = stabler but more costly; balance with data size & VRAM. | +| `kde_K` | 16–64 | #neighbors in KDE. Smaller = more sensitive; larger = smoother. Usually `kde_K ≤ max_K`. | +| `sigma` | 0.5–2.0 | KDE bandwidth. Too small ⇒ noisy; too large ⇒ oversmoothing. | +| `alpha` | 0.3–0.7 | Trade‑off between representativeness (density) and coverage (diversity). | +| `C` | 0.01–1.0 | Selection ratio/threshold or regularization strength depending on implementation. | +| `sample_size` | 500–5000 | Candidate pool size per selection step; heavily impacts speed & quality. | +| `model_name` | — | Path/name of the sentence encoder (local BERT/USE/SimCSE, etc.). | +| `cache_dir` | — | Cache directory for intermediate artifacts and resume‑from‑cache. | + +--- + +## 5. 
Component Config (`components.yaml`) + +**Path:** `DataFlex/src/dataflex/configs/components.yaml` + +**Preset example** + +```yaml +tsds: + name: tsds + params: + max_K: 128 + kde_K: 64 + sigma: 0.8 + alpha: 0.5 + C: 10.0 + model_name: "/home/lianghao/yry/TSDS/bert_chinese" + cache_dir: ../dataflex_saves/tsds_output +``` + +--- + +## 6. Dynamic Training Config (LoRA + TSDS) + +**Example file:** `DataFlex/examples/train_lora/selectors/tsds.yaml` + +```yaml +### model +model_name_or_path: /home/lianghao/yry/LLaMA-Factory/Qwen2.5-0.5B-Instruct +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: all +lora_rank: 16 +lora_alpha: 8 +# deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json] + +### dataset +dataset: alpaca_en_demo +template: qwen +cutoff_len: 4096 +# max_samples: 100000000 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 0 +# disable_shuffling: true +seed: 42 + +### output +output_dir: ../dataflex_saves/qwen/tsds +logging_steps: 10 +save_steps: 100 +plot_loss: true +save_only_model: false +overwrite_output_dir: true + +### swanlab +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] +# use_swanlab: true +# swanlab_project: medical_dynamic_sft +# swanlab_run_name: qwen2_5_3b_lora_medical_50k_baseline +# swanlab_workspace: word2li +# swanlab_api_key: +# swanlab_lark_webhook_url: +# swanlab_lark_secret: + +### train +per_device_train_batch_size: 2 +gradient_accumulation_steps: 16 +learning_rate: 1.0e-4 +num_train_epochs: 1.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: false + +### Dataflex args +train_type: dynamic_select # trainer type: + # "dynamic_select" | "dynamic_mix" | "dynamic_weight" | "static" +components_cfg_file: src/dataflex/configs/components.yaml +component_name: tsds # must match the name in components_cfg_file +warmup_step: 400 +update_step: 500 
+update_times: 2 +# eval_dataset: alpaca_zh_demo +eval_dataset: alpaca_zh_demo +``` + +**Notes:** + +* `component_name: tsds` enables the TSDS component. +* `warmup_step / update_step / update_times` decide **when** and **how often** to re‑select the training subset; total steps ≈ `warmup_step + update_step × update_times`. +* `eval_dataset` provides the **target distribution** reference for similarity/representativeness scoring. + +--- + +## 7. Run Training + +```bash +FORCE_TORCHRUN=0 DISABLE_VERSION_CHECK=1 dataflex-cli train examples/train_lora/selectors/tsds.yaml +``` + +**Note:** the above example runs without distributed launch. + +During training, TSDS is triggered at scheduled steps: encode training samples → NN search / KDE → combine with diversity → select the next training subset. + +--- + +## 8. Merge & Export the Model + +Same as the Less Selector pipeline. + +**Config file:** `DataFlex/examples/merge_lora/llama3_lora_sft.yaml` + +```yaml +model_name_or_path: +adapter_name_or_path: +template: qwen +trust_remote_code: true + +export_dir: ../dataflex_saves/Qwen2.5-0.5B_lora_sft +export_size: 5 +export_device: cpu +export_legacy_format: false +``` + +Run the export command (inside the LLaMA‑Factory directory): + +```bash +llamafactory-cli export llama3_lora_sft.yaml +``` + +--- + +## 9. Evaluation & Comparison + +We recommend using the [DataFlow](https://github.com/OpenDCAI/DataFlow) QA evaluation pipeline to compare **TSDS** against **Less** and **random sampling**. 
+ + diff --git a/docs/zh/notes/guide/selector/selector_tsds.md b/docs/zh/notes/guide/selector/selector_tsds.md new file mode 100644 index 0000000..a1e65e9 --- /dev/null +++ b/docs/zh/notes/guide/selector/selector_tsds.md @@ -0,0 +1,264 @@ +--- +title: Tsds 数据选择器 +createTime: 2025/11/01 21:35:45 +permalink: /zh/guide/vkqfowej/ +icon: tdesign:cat + +--- + + +# TSDS Selector 使用介绍 + +本文档介绍如何在 **DataFlex** 框架中使用 **TSDS Selector**(Data Selection for Task-Specific Model Finetuning)实现训练数据的**动态选择**,以在监督微调(SFT)中兼顾**密度代表性**与**多样性**,提升泛化效果。 + +--- + +## 1. 方法概述 + +**TSDS** 的核心思想是: + +* 先将**已分词(tokenized)**的样本进一步编码为**句向量**(例如 512 维)。 +* 在嵌入空间中进行**近邻搜索 & 密度估计(KDE)**,得到每个样本的“代表性分数”。 +* 同时考虑**拓扑多样性**(避免只挑“挤在一起”的样本),在密度与多样性之间用系数 `alpha` 做权衡。 + +> 直观理解:密度高 = 更“典型/代表”的数据, +> 多样性高 = 覆盖面更广、减少信息冗余。 + +### 评分构成 + +设样本的句向量为 $e_i$,其 $K$ 个近邻集合为 $\mathcal{N}_K(i)$。 + +1. **核密度估计(KDE)**: +$$ +\text{density}(i) += \frac{1}{K} \sum_{j\in \mathcal{N}_K(i)} +\exp\!\left(-\frac{\lVert e_i - e_j \rVert^2}{2\sigma^2}\right) +$$ + +2. **多样性(简单实现可用去冗余惩罚/边际增益)**: +$$ +\text{diversity}(i)\ \propto\ +\min_{j\in S} \lVert e_i - e_j \rVert,\quad +S=\text{已选集合} +$$ + +3. **综合评分**: +$$ +\text{score}(i) += \alpha\, \text{density}(i) ++ (1-\alpha)\, \text{diversity}(i) +$$ + +> 实际实现中,`kde_K`(用于密度估计的近邻数)与 `max_K`(总检索近邻上限)可不同;`C` 可作为筛选比例/阈值等控制量。 + +--- + +## 2. 环境与依赖 + +```bash +# DataFlex(建议源码安装) +git clone https://github.com/OpenDCAI/DataFlex.git +cd DataFlex +pip install -e . + +# 训练与推理的常用依赖(按需) +pip install llamafactory + +# TSDS 额外依赖(向量检索与进度条等) +pip install faiss-cpu tqdm +``` + +--- + +## 3. 
选择器注册与初始化示例 + +在自定义组件中注册 TSDS 选择器: + +```python +from dataflex.selectors import Selector, register_selector + +@register_selector("tsds") +class TsdsSelector(Selector): + """Topological & Statistical Density Selector""" + def __init__( + self, + dataset, + eval_dataset, + accelerator, + data_collator, + cache_dir, + seed: int = 42, + max_K: int = 128, + kde_K: int = 64, + sigma: float = 1.0, + alpha: float = 0.5, + C: float = 10.0, + sample_size: int = 1000, + model_name: str = "/home/lianghao/yry/TSDS/bert_chinese" # 句向量编码模型 + ): + super().__init__(dataset, accelerator, data_collator, cache_dir) + +``` +**TODO: 请将 `model_name` 修改为自己本地的句向量模型路径,否则会使用上面的默认路径并引发报错。** + +> **注意**:此处的 `model_name` 用于将**tokenized**后的文本进一步编码为**句向量**(例如 512 维),常见选择是 BERT/USE 等句向量模型。 + +--- + +## 4. 关键超参数与建议 + +| 参数 | 典型范围 | 含义与建议 | +| ------------- | -------- | ----------------------------------------- | +| `max_K` | 64–256 | 近邻检索数量上限,越大越稳但开销更高;建议与数据规模/显存权衡 | +| `kde_K` | 16–64 | 用于密度估计的邻居数,越小更敏感、越大更平滑;通常 `kde_K ≤ max_K` | +| `sigma` | 0.5–2.0 | KDE 的核宽度,过小噪声大,过大易过平滑 | +| `alpha` | 0.3–0.7 | 密度 vs 多样性的权衡系数,靠 1 偏重代表性,靠 0 偏重覆盖度 | +| `C` | 0.01–1.0 | 用作筛选比例/阈值/正则系数等控制量;与实现细节相关 | +| `sample_size` | 500–5000 | 每次候选评估的样本数上限;大幅影响速度与效果 | +| `model_name` | — | 句向量编码模型路径或名称(如本地 BERT/USE) | +| `cache_dir` | — | 中间结果缓存路径,便于断点续跑 | + +--- + +## 5. 组件配置(components.yaml) + +**路径:** `DataFlex/src/dataflex/configs/components.yaml` + +**预设参数** + +```yaml +tsds: + name: tsds + params: + max_K: 128 + kde_K: 64 + sigma: 0.8 + alpha: 0.5 + C: 10.0 + model_name: "/home/lianghao/yry/TSDS/bert_chinese" + cache_dir: ../dataflex_saves/tsds_output +``` + +--- + +## 6. 
动态训练配置(LoRA + TSDS) + +**示例文件:** `DataFlex/examples/train_lora/selectors/tsds.yaml` + +```yaml +### model +model_name_or_path: /home/lianghao/yry/LLaMA-Factory/Qwen2.5-0.5B-Instruct +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: all +lora_rank: 16 +lora_alpha: 8 +# deepspeed: examples/deepspeed/ds_z3_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json] + +### dataset +dataset: alpaca_en_demo +template: qwen +cutoff_len: 4096 +# max_samples: 100000000 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 0 +# disable_shuffling: true +seed: 42 + +### output +output_dir: ../dataflex_saves/qwen/tsds +logging_steps: 10 +save_steps: 100 +plot_loss: true +save_only_model: false +overwrite_output_dir: true + +### swanlab +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] +# use_swanlab: true +# swanlab_project: medical_dynamic_sft +# swanlab_run_name: qwen2_5_3b_lora_medical_50k_baseline +# swanlab_workspace: word2li +# swanlab_api_key: +# swanlab_lark_webhook_url: +# swanlab_lark_secret: + +### train +per_device_train_batch_size: 2 +gradient_accumulation_steps: 16 +learning_rate: 1.0e-4 +num_train_epochs: 1.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: false + +### Dataflex args +train_type: dynamic_select # 选择训练器类型。可选值包括: + # "dynamic_select" - 动态选择训练器 + # "dynamic_mix" - 动态混合训练器 + # "dynamic_weight" - 动态加权训练器 + # "static" - 默认静态训练器 +components_cfg_file: src/dataflex/configs/components.yaml +component_name: tsds # 选择组件名称,对应 components_cfg_file 中定义的组件 +warmup_step: 400 +update_step: 500 +update_times: 2 +# eval_dataset: alpaca_zh_demo +eval_dataset: alpaca_zh_demo + +``` + +**参数说明:** + +* `component_name: tsds`:启用 TSDS 组件。 +* `warmup_step / update_step / 
update_times`:决定**何时**与**多久**进行一次动态选择;总步数 ≈ `warmup_step + update_step × update_times`。 +* `eval_dataset`:为 TSDS 提供“目标分布”的参考(决定相似度/代表性评估的方向)。 + +--- + +## 7. 运行训练 + +```bash +FORCE_TORCHRUN=0 DISABLE_VERSION_CHECK=1 dataflex-cli train examples/train_lora/selectors/tsds.yaml +``` +**注意:上述示例无需使用分布式启动。** + +训练过程中会在设定的步数触发 TSDS 动态选择:编码训练样本 → 近邻检索/密度估计 → 结合多样性打分 → 选出下一阶段训练子集。 + +--- + +## 8. 模型合并与导出 + +与 Less Selector 流程一致: + +**配置文件:** `DataFlex/examples/merge_lora/llama3_lora_sft.yaml` + +```yaml +model_name_or_path: 原模型地址 +adapter_name_or_path: 微调后adapter地址 +template: qwen +trust_remote_code: true + +export_dir: ../dataflex_saves/Qwen2.5-0.5B_lora_sft +export_size: 5 +export_device: cpu +export_legacy_format: false +``` + +导出命令: +在llamafactory文件夹中运行 +```bash +llamafactory-cli export llama3_lora_sft.yaml +``` + +--- + +## 9. 评估与对比 + +建议使用 [DataFlow](https://github.com/OpenDCAI/DataFlow) 的模型 QA 评估流水线,对 **TSDS** 与 **Less**、**随机采样** 等策略进行并列评测。 \ No newline at end of file