InternScience
diff --git a/‎.env.example‎
Lines changed: 24 additions & 2 deletions b/‎.env.example‎
Lines changed: 24 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 39 additions & 10 deletions b/‎README.md‎
Lines changed: 39 additions & 10 deletions
diff --git a/‎README_zh.md‎
Lines changed: 43 additions & 17 deletions b/‎README_zh.md‎
Lines changed: 43 additions & 17 deletions
diff --git a/‎graphgen/bases/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎graphgen/bases/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎graphgen/bases/base_generator.py‎
Lines changed: 2 additions & 2 deletions b/‎graphgen/bases/base_generator.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎graphgen/bases/base_kg_builder.py‎
Lines changed: 2 additions & 2 deletions b/‎graphgen/bases/base_kg_builder.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎graphgen/bases/base_llm_client.py‎ ‎graphgen/bases/base_llm_wrapper.py‎graphgen/bases/base_llm_client.py renamed to graphgen/bases/base_llm_wrapper.py
Lines changed: 7 additions & 1 deletion b/‎graphgen/bases/base_llm_client.py‎ ‎graphgen/bases/base_llm_wrapper.py‎graphgen/bases/base_llm_client.py renamed to graphgen/bases/base_llm_wrapper.py
Lines changed: 7 additions & 1 deletion
diff --git a/‎graphgen/graphgen.py‎
Lines changed: 20 additions & 17 deletions b/‎graphgen/graphgen.py‎
Lines changed: 20 additions & 17 deletions
diff --git a/‎graphgen/models/__init__.py‎
Lines changed: 1 addition & 2 deletions b/‎graphgen/models/__init__.py‎
Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,29 @@
+# Tokenizer
 TOKENIZER_MODEL=
-SYNTHESIZER_MODEL=
+
+# LLM
+# Support different backends: http_api, openai_api, ollama_api, ollama, huggingface, tgi, sglang, tensorrt
+
+# http_api / openai_api
+SYNTHESIZER_BACKEND=openai_api
+SYNTHESIZER_MODEL=gpt-4o-mini
 SYNTHESIZER_BASE_URL=
 SYNTHESIZER_API_KEY=
-TRAINEE_MODEL=
+TRAINEE_BACKEND=openai_api
+TRAINEE_MODEL=gpt-4o-mini
 TRAINEE_BASE_URL=
 TRAINEE_API_KEY=
+
+# # ollama_api
+# SYNTHESIZER_BACKEND=ollama_api
+# SYNTHESIZER_MODEL=gemma3
+# SYNTHESIZER_BASE_URL=http://localhost:11434
+#
+# Note: TRAINEE with ollama_api backend is not supported yet as ollama_api does not support logprobs.
+
+# # huggingface
+# SYNTHESIZER_BACKEND=huggingface
+# SYNTHESIZER_MODEL=Qwen/Qwen2.5-0.5B-Instruct
+#
+# TRAINEE_BACKEND=huggingface
+# TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct
@@ -21,13 +21,14 @@
 
 GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation
 
-[English](README.md) | [中文](README_zh)
+[English](README.md) | [中文](README_zh.md)
 
 <details close>
 <summary><b>📚 Table of Contents</b></summary>
 
 - 📝 [What is GraphGen?](#-what-is-graphgen)
 - 📌 [Latest Updates](#-latest-updates)
+- ⚙️ [Support List](#-support-list)
 - 🚀 [Quick Start](#-quick-start)
 - 🏗️ [System Architecture](#-system-architecture)
 - 🍀 [Acknowledgements](#-acknowledgements)
@@ -47,13 +48,13 @@ GraphGen is a framework for synthetic data generation guided by knowledge graphs
 
 Here are the post-training results, where **over 50% of the SFT data** comes from GraphGen and our data-cleaning pipeline.
 
-| Domain | Dataset | Ours | Qwen2.5-7B-Instruct (baseline)	|
-| :-: | :-: | :-: | :-: |
-| Plant| [SeedBench](https://github.com/open-sciencelab/SeedBench) | **65.9** | 51.5 |
-| Common | CMMLU | 73.6 | **75.8** |
-| Knowledge | GPQA-Diamond | **40.0** | 33.3 |
-| Math | AIME24 | **20.6** | 16.7 |
-| | AIME25 | **22.7** | 7.2 |
+|  Domain   |                          Dataset                          |   Ours   | Qwen2.5-7B-Instruct (baseline) |
+|:---------:|:---------------------------------------------------------:|:--------:|:------------------------------:|
+|   Plant   | [SeedBench](https://github.com/open-sciencelab/SeedBench) | **65.9** |              51.5              |
+|  Common   |                           CMMLU                           |   73.6   |            **75.8**            |
+| Knowledge |                       GPQA-Diamond                        | **40.0** |              33.3              |
+|   Math    |                          AIME24                           | **20.6** |              16.7              |
+|           |                          AIME25                           | **22.7** |              7.2               |
 
 It begins by constructing a fine-grained knowledge graph from the source text, then identifies knowledge gaps in LLMs using the expected calibration error metric, prioritizing the generation of QA pairs that target high-value, long-tail knowledge.
 Furthermore, GraphGen incorporates multi-hop neighborhood sampling to capture complex relational information and employs style-controlled generation to diversify the resulting QA data.
@@ -62,20 +63,48 @@ After data generation, you can use [LLaMA-Factory](https://github.com/hiyouga/LL
 
 ## 📌 Latest Updates
 
+- **2025.10.30**: We support several new LLM clients and inference backends including [Ollama_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/ollama_client.py), [http_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/http_client.py), [HuggingFace Transformers](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/hf_wrapper.py) and [SGLang](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/sglang_wrapper.py).
 - **2025.10.23**: We support VQA(Visual Question Answering) data generation now. Run script: `bash scripts/generate/generate_vqa.sh`.
 - **2025.10.21**: We support PDF as input format for data generation now via [MinerU](https://github.com/opendatalab/MinerU).
-- **2025.09.29**: We auto-update gradio demo on [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) and [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen).
 
 <details>
 <summary>History</summary>
 
+- **2025.09.29**: We auto-update gradio demo on [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) and [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen).
 - **2025.08.14**: We have added support for community detection in knowledge graphs using the Leiden algorithm, enabling the synthesis of Chain-of-Thought (CoT) data.
 - **2025.07.31**: We have added Google, Bing, Wikipedia, and UniProt as search back-ends.
 - **2025.04.21**: We have released the initial version of GraphGen.
 
 </details>
 
 
+## ⚙️ Support List
+
+We support various LLM inference servers, API servers, inference clients, input file formats, data modalities, output data formats, and output data types.
+Users can flexibly configure these options according to their synthetic data needs.
+
+| Inference Server                             | API Server                                                                     | Inference Client                                           | Input File Format                  | Data Modality | Data Format                  | Data Type                                       |
+|----------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------|------------------------------------|---------------|------------------------------|-------------------------------------------------|
+| [![hf-icon]HF][hf]<br>[![sg-icon]SGLang][sg] | [![sif-icon]Silicon][sif]<br>[![oai-icon]OpenAI][oai]<br>[![az-icon]Azure][az] | HTTP<br>[![ol-icon]Ollama][ol]<br>[![oai-icon]OpenAI][oai] | CSV<br>JSON<br>JSONL<br>PDF<br>TXT | TEXT<br>IMAGE | Alpaca<br>ChatML<br>Sharegpt | Aggregated<br>Atomic<br>CoT<br>Multi-hop<br>VQA |
+
+<!-- links -->
+[hf]: https://huggingface.co/docs/transformers/index
+[sg]: https://docs.sglang.ai
+[sif]: https://siliconflow.cn
+[oai]: https://openai.com
+[az]: https://azure.microsoft.com/en-us/services/cognitive-services/openai-service/
+[ol]: https://ollama.com
+
+<!-- icons -->
+[hf-icon]: https://www.google.com/s2/favicons?domain=https://huggingface.co
+[sg-icon]: https://www.google.com/s2/favicons?domain=https://docs.sglang.ai
+[sif-icon]: https://www.google.com/s2/favicons?domain=siliconflow.com
+[oai-icon]: https://www.google.com/s2/favicons?domain=https://openai.com
+[az-icon]: https://www.google.com/s2/favicons?domain=https://azure.microsoft.com
+[ol-icon]: https://www.google.com/s2/favicons?domain=https://ollama.com
+
+
+
 ## 🚀 Quick Start
 
 Experience GraphGen through [Web](https://g-app-center-120612-6433-jpdvmvp.openxlab.space) or [Backup Web Entrance](https://openxlab.org.cn/apps/detail/chenzihonga/GraphGen)
@@ -176,7 +205,7 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe
    Pick the desired format and run the matching script:
 
    | Format       | Script to run                                  | Notes                                                             |
-   | ------------ | ---------------------------------------------- |-------------------------------------------------------------------|
+   |--------------|------------------------------------------------|-------------------------------------------------------------------|
    | `cot`        | `bash scripts/generate/generate_cot.sh`        | Chain-of-Thought Q\&A pairs                                       |
    | `atomic`     | `bash scripts/generate/generate_atomic.sh`     | Atomic Q\&A pairs covering basic knowledge                        |
    | `aggregated` | `bash scripts/generate/generate_aggregated.sh` | Aggregated Q\&A pairs incorporating complex, integrated knowledge |
 
@@ -20,19 +20,20 @@
 
 GraphGen: Enhancing Supervised Fine-Tuning for LLMs with Knowledge-Driven Synthetic Data Generation
 
-[English](README.md) | [中文](README_zh)
+[English](README.md) | [中文](README_zh.md)
 
 <details close>
 <summary><b>📚 目录</b></summary>
 
 - 📝 [什么是 GraphGen？](#-什么是-graphgen)
-- 📌 [最新更新](#最新更新)
-- 🚀 [快速开始](#快速开始)
-- 🏗️ [系统架构](#系统架构)
-- 🍀 [致谢](#致谢)
-- 📚 [引用](#引用)
-- 📜 [许可证](#许可证)
-- 📅 [星标历史](#星标历史)
+- 📌 [最新更新](#-最新更新)
+- ⚙️ [支持列表](#-支持列表)
+- 🚀 [快速开始](#-快速开始)
+- 🏗️ [系统架构](#-系统架构)
+- 🍀 [致谢](#-致谢)
+- 📚 [引用](#-引用)
+- 📜 [许可证](#-许可证)
+- 📅 [星标历史](#-星标历史)
 
 
 [//]: # (- 🌟 [主要特性](#主要特性))
@@ -48,34 +49,59 @@ GraphGen 是一个基于知识图谱的数据合成框架。请查看[**论文**
 
 以下是在超过 50 % 的 SFT 数据来自 GraphGen 及我们的数据清洗流程时的训练后结果：
 
-| 领域 | 数据集 | 我们的方案 | Qwen2.5-7B-Instruct（基线） |
-| :-: | :-: | :-: | :-: |
-| 植物 | [SeedBench](https://github.com/open-sciencelab/SeedBench) | **65.9** | 51.5 |
-| 常识 | CMMLU | 73.6 | **75.8** |
-| 知识 | GPQA-Diamond | **40.0** | 33.3 |
-| 数学 | AIME24 | **20.6** | 16.7 |
-| | AIME25 | **22.7** | 7.2 |
+| 领域 |                            数据集                            |  我们的方案   | Qwen2.5-7B-Instruct（基线） |
+|:--:|:---------------------------------------------------------:|:--------:|:-----------------------:|
+| 植物 | [SeedBench](https://github.com/open-sciencelab/SeedBench) | **65.9** |          51.5           |
+| 常识 |                           CMMLU                           |   73.6   |        **75.8**         |
+| 知识 |                       GPQA-Diamond                        | **40.0** |          33.3           |
+| 数学 |                          AIME24                           | **20.6** |          16.7           |
+|    |                          AIME25                           | **22.7** |           7.2           |
 
 GraphGen 首先根据源文本构建细粒度的知识图谱，然后利用期望校准误差指标识别大语言模型中的知识缺口，优先生成针对高价值长尾知识的问答对。  
 此外，GraphGen 采用多跳邻域采样捕获复杂关系信息，并使用风格控制生成来丰富问答数据的多样性。
 
 在数据生成后，您可以使用[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) 和 [xtuner](https://github.com/InternLM/xtuner)对大语言模型进行微调。
 
 ## 📌 最新更新
-
+- **2025.10.30**：我们支持多种新的 LLM 客户端和推理后端，包括 [Ollama_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/ollama_client.py), [http_client](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/api/http_client.py), [HuggingFace Transformers](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/hf_wrapper.py) 和 [SGLang](https://github.com/open-sciencelab/GraphGen/blob/main/graphgen/models/llm/local/sglang_wrapper.py)。
 - **2025.10.23**：我们现在支持视觉问答（VQA）数据生成。运行脚本：`bash scripts/generate/generate_vqa.sh`。
 - **2025.10.21**：我们现在通过 [MinerU](https://github.com/opendatalab/MinerU) 支持 PDF 作为数据生成的输入格式。
-- **2025.09.29**：我们在 [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) 和 [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen) 上自动更新 Gradio 应用。
 
 <details>
 <summary>历史更新</summary>
 
+- **2025.09.29**：我们在 [Hugging Face](https://huggingface.co/spaces/chenzihong/GraphGen) 和 [ModelScope](https://modelscope.cn/studios/chenzihong/GraphGen) 上自动更新 Gradio 应用。
 - **2025.08.14**：支持利用 Leiden 社区发现算法对知识图谱进行社区划分，合成 CoT 数据。
 - **2025.07.31**：新增 Google、Bing、Wikipedia 和 UniProt 作为搜索后端，帮助填补数据缺口。  
 - **2025.04.21**：发布 GraphGen 初始版本。
 
 </details>
 
+## ⚙️ 支持列表
+
+我们支持多种 LLM 推理服务器、API 服务器、推理客户端、输入文件格式、数据模态、输出数据格式和输出数据类型。
+可以根据合成数据的需求进行灵活配置。
+
+| 推理服务器                                        | API 服务器                                                                        | 推理客户端                                                      | 输入文件格式                             | 数据模态         | 输出数据格式                       | 输出数据类型                                          |
+|----------------------------------------------|--------------------------------------------------------------------------------|------------------------------------------------------------|------------------------------------|--------------|------------------------------|-------------------------------------------------|
+| [![hf-icon]HF][hf]<br>[![sg-icon]SGLang][sg] | [![sif-icon]Silicon][sif]<br>[![oai-icon]OpenAI][oai]<br>[![az-icon]Azure][az] | HTTP<br>[![ol-icon]Ollama][ol]<br>[![oai-icon]OpenAI][oai] | CSV<br>JSON<br>JSONL<br>PDF<br>TXT | TEXT<br>IMAGE | Alpaca<br>ChatML<br>Sharegpt | Aggregated<br>Atomic<br>CoT<br>Multi-hop<br>VQA |
+
+<!-- links -->
+[hf]: https://huggingface.co/docs/transformers/index
+[sg]: https://docs.sglang.ai
+[sif]: https://siliconflow.cn
+[oai]: https://openai.com
+[az]: https://azure.microsoft.com/en-us/services/cognitive-services/openai-service/
+[ol]: https://ollama.com
+
+<!-- icons -->
+[hf-icon]: https://www.google.com/s2/favicons?domain=https://huggingface.co
+[sg-icon]: https://www.google.com/s2/favicons?domain=https://docs.sglang.ai
+[sif-icon]: https://www.google.com/s2/favicons?domain=siliconflow.com
+[oai-icon]: https://www.google.com/s2/favicons?domain=https://openai.com
+[az-icon]: https://www.google.com/s2/favicons?domain=https://azure.microsoft.com
+[ol-icon]: https://www.google.com/s2/favicons?domain=https://ollama.com
+
 
 ## 🚀 快速开始
 
 
@@ -1,6 +1,6 @@
 from .base_generator import BaseGenerator
 from .base_kg_builder import BaseKGBuilder
-from .base_llm_client import BaseLLMClient
+from .base_llm_wrapper import BaseLLMWrapper
 from .base_partitioner import BasePartitioner
 from .base_reader import BaseReader
 from .base_splitter import BaseSplitter
 
@@ -1,15 +1,15 @@
 from abc import ABC, abstractmethod
 from typing import Any
 
-from graphgen.bases.base_llm_client import BaseLLMClient
+from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
 
 
 class BaseGenerator(ABC):
     """
     Generate QAs based on given prompts.
     """
 
-    def __init__(self, llm_client: BaseLLMClient):
+    def __init__(self, llm_client: BaseLLMWrapper):
         self.llm_client = llm_client
 
     @staticmethod
 
@@ -2,13 +2,13 @@
 from collections import defaultdict
 from typing import Dict, List, Tuple
 
-from graphgen.bases.base_llm_client import BaseLLMClient
+from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
 from graphgen.bases.base_storage import BaseGraphStorage
 from graphgen.bases.datatypes import Chunk
 
 
 class BaseKGBuilder(ABC):
-    def __init__(self, llm_client: BaseLLMClient):
+    def __init__(self, llm_client: BaseLLMWrapper):
         self.llm_client = llm_client
         self._nodes: Dict[str, List[dict]] = defaultdict(list)
         self._edges: Dict[Tuple[str, str], List[dict]] = defaultdict(list)
 
@@ -8,7 +8,7 @@
 from graphgen.bases.datatypes import Token
 
 
-class BaseLLMClient(abc.ABC):
+class BaseLLMWrapper(abc.ABC):
     """
     LLM client base class, agnostic to specific backends (OpenAI / Ollama / ...).
     """
@@ -66,3 +66,9 @@ def filter_think_tags(text: str, think_tag: str = "think") -> str:
         think_pattern = re.compile(rf"<{think_tag}>.*?</{think_tag}>", re.DOTALL)
         filtered_text = think_pattern.sub("", text).strip()
         return filtered_text if filtered_text else text.strip()
+
+    def shutdown(self) -> None:
+        """Shutdown the LLM engine if applicable."""
+
+    def restart(self) -> None:
+        """Reinitialize the LLM engine if applicable."""
@@ -5,6 +5,7 @@
 
 import gradio as gr
 
+from graphgen.bases import BaseLLMWrapper
 from graphgen.bases.base_storage import StorageNameSpace
 from graphgen.bases.datatypes import Chunk
 from graphgen.models import (
@@ -18,6 +19,7 @@
     build_kg,
     chunk_documents,
     generate_qas,
+    init_llm,
     judge_statement,
     partition_kg,
     quiz,
@@ -39,30 +41,18 @@ def __init__(
         trainee_llm_client: OpenAIClient = None,
         progress_bar: gr.Progress = None,
     ):
-        self.unique_id = unique_id
-        self.working_dir = working_dir
+        self.unique_id: int = unique_id
+        self.working_dir: str = working_dir
 
         # llm
         self.tokenizer_instance: Tokenizer = tokenizer_instance or Tokenizer(
             model_name=os.getenv("TOKENIZER_MODEL")
         )
 
-        self.synthesizer_llm_client: OpenAIClient = (
-            synthesizer_llm_client
-            or OpenAIClient(
-                model_name=os.getenv("SYNTHESIZER_MODEL"),
-                api_key=os.getenv("SYNTHESIZER_API_KEY"),
-                base_url=os.getenv("SYNTHESIZER_BASE_URL"),
-                tokenizer=self.tokenizer_instance,
-            )
-        )
-
-        self.trainee_llm_client: OpenAIClient = trainee_llm_client or OpenAIClient(
-            model_name=os.getenv("TRAINEE_MODEL"),
-            api_key=os.getenv("TRAINEE_API_KEY"),
-            base_url=os.getenv("TRAINEE_BASE_URL"),
-            tokenizer=self.tokenizer_instance,
+        self.synthesizer_llm_client: BaseLLMWrapper = (
+            synthesizer_llm_client or init_llm("synthesizer")
         )
+        self.trainee_llm_client: BaseLLMWrapper = trainee_llm_client
 
         self.full_docs_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="full_docs"
@@ -210,16 +200,29 @@ async def quiz_and_judge(self, quiz_and_judge_config: Dict):
         )
 
         # TODO： assert trainee_llm_client is valid before judge
+        if not self.trainee_llm_client:
+            # TODO: shutdown existing synthesizer_llm_client properly
+            logger.info("No trainee LLM client provided, initializing a new one.")
+            self.synthesizer_llm_client.shutdown()
+            self.trainee_llm_client = init_llm("trainee")
+
         re_judge = quiz_and_judge_config["re_judge"]
         _update_relations = await judge_statement(
             self.trainee_llm_client,
             self.graph_storage,
             self.rephrase_storage,
             re_judge,
         )
+
         await self.rephrase_storage.index_done_callback()
         await _update_relations.index_done_callback()
 
+        logger.info("Shutting down trainee LLM client.")
+        self.trainee_llm_client.shutdown()
+        self.trainee_llm_client = None
+        logger.info("Restarting synthesizer LLM client.")
+        self.synthesizer_llm_client.restart()
+
     @async_to_sync_method
     async def generate(self, partition_config: Dict, generate_config: Dict):
         # Step 1: partition the graph
 
@@ -7,8 +7,7 @@
     VQAGenerator,
 )
 from .kg_builder import LightRAGKGBuilder, MMKGBuilder
-from .llm.openai_client import OpenAIClient
-from .llm.topk_token_model import TopkTokenModel
+from .llm import HTTPClient, OllamaClient, OpenAIClient
 from .partitioner import (
     AnchorBFSPartitioner,
     BFSPartitioner,