Skip to content

Commit 8b68414

Browse files
committed
improvement(app.py): use async for
1 parent 855217c commit 8b68414

File tree

6 files changed

+28
-20
lines changed

6 files changed

+28
-20
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,3 +108,9 @@ Experience it on the [OpenXLab Application Center](https://openxlab.org.cn/apps/
108108

109109
### Workflow
110110
![workflow](resources/images/flow.png)
111+
112+
113+
## 🍀 Acknowledgements
114+
- [SiliconCloud](https://siliconflow.cn) Abundant LLM API, some models are free
115+
- [LightRAG](https://github.com/HKUDS/LightRAG) Simple and efficient graph retrieval solution
116+
- [ROGRAG](https://github.com/tpoisonooo/ROGRAG) ROGRAG: A Robustly Optimized GraphRAG Framework

graphgen/graphgen.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,8 @@ async def async_split_chunks(self, data: Union[List[list], List[dict]], data_typ
8585

8686
cur_index = 1
8787
doc_number = len(new_docs)
88-
for doc_key, doc in tqdm_async(
89-
new_docs.items(), desc="Chunking documents", unit="doc"
88+
async for doc_key, doc in tqdm_async(
89+
new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
9090
):
9191
chunks = {
9292
compute_content_hash(dp["content"], prefix="chunk-"): {
@@ -117,7 +117,7 @@ async def async_split_chunks(self, data: Union[List[list], List[dict]], data_typ
117117
logger.warning("All docs are already in the storage")
118118
return {}
119119
logger.info("[New Docs] inserting %d docs", len(new_docs))
120-
for doc in tqdm_async(data, desc="Chunking documents", unit="doc"):
120+
async for doc in tqdm_async(data, desc="[1/4]Chunking documents", unit="doc"):
121121
doc_str = "".join([chunk['content'] for chunk in doc])
122122
for chunk in doc:
123123
chunk_key = compute_content_hash(chunk['content'], prefix="chunk-")

graphgen/operators/extract_kg.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,16 +103,16 @@ async def _process_single_content(chunk: Chunk, max_loop: int = 3):
103103

104104
results = []
105105
chunk_number = len(chunks)
106-
for result in tqdm_async(
106+
async for result in tqdm_async(
107107
asyncio.as_completed([_process_single_content(c) for c in chunks]),
108108
total=len(chunks),
109-
desc="Extracting entities and relationships from chunks",
109+
desc="[3/4]Extracting entities and relationships from chunks",
110110
unit="chunk",
111111
):
112112
try:
113113
results.append(await result)
114114
if progress_bar is not None:
115-
progress_bar(len(results) / chunk_number, desc="Extracting entities and relationships from chunks")
115+
progress_bar(len(results) / chunk_number, desc="[3/4]Extracting entities and relationships from chunks")
116116
except Exception as e: # pylint: disable=broad-except
117117
logger.error("Error occurred while extracting entities and relationships from chunks: %s", e)
118118

graphgen/operators/traverse_graph.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -292,11 +292,11 @@ async def _process_single_batch(
292292

293293
for result in tqdm_async(asyncio.as_completed(
294294
[_process_single_batch(batch) for batch in processing_batches]
295-
), total=len(processing_batches), desc="Generating QAs"):
295+
), total=len(processing_batches), desc="[4/4]Generating QAs"):
296296
try:
297297
results.update(await result)
298298
if progress_bar is not None:
299-
progress_bar(len(results) / len(processing_batches), desc="Generating QAs")
299+
progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs")
300300
except Exception as e: # pylint: disable=broad-except
301301
logger.error("Error occurred while generating QA: %s", e)
302302

@@ -398,12 +398,12 @@ async def _generate_question(
398398
for result in tqdm_async(
399399
asyncio.as_completed([_generate_question(task) for task in tasks]),
400400
total=len(tasks),
401-
desc="Generating QAs"
401+
desc="[4/4]Generating QAs"
402402
):
403403
try:
404404
results.update(await result)
405405
if progress_bar is not None:
406-
progress_bar(len(results) / len(tasks), desc="Generating QAs")
406+
progress_bar(len(results) / len(tasks), desc="[4/4]Generating QAs")
407407
except Exception as e: # pylint: disable=broad-except
408408
logger.error("Error occurred while generating QA: %s", e)
409409
return results
@@ -507,15 +507,15 @@ async def _process_single_batch(
507507
logger.error("Error occurred while processing batch: %s", e)
508508
return {}
509509

510-
for result in tqdm_async(
510+
async for result in tqdm_async(
511511
asyncio.as_completed([_process_single_batch(batch) for batch in processing_batches]),
512512
total=len(processing_batches),
513-
desc="Generating QAs"
513+
desc="[4/4]Generating QAs"
514514
):
515515
try:
516516
results.update(await result)
517517
if progress_bar is not None:
518-
progress_bar(len(results) / len(processing_batches), desc="Generating QAs")
518+
progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs")
519519
except Exception as e: # pylint: disable=broad-except
520520
logger.error("Error occurred while generating QA: %s", e)
521521
return results

webui/app.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def sum_tokens(client):
119119
# Initialize GraphGen
120120
graph_gen = init_graph_gen(config, env)
121121
graph_gen.clear()
122-
progress(0.2, "Model Initialized")
122+
progress(0.2, "[2/4]Model Initialized")
123123

124124
graph_gen.progress_bar = progress
125125

@@ -378,7 +378,7 @@ def sum_tokens(client):
378378
with gr.Column():
379379
rpm = gr.Slider(
380380
label="RPM",
381-
minimum=500,
381+
minimum=10,
382382
maximum=10000,
383383
value=1000,
384384
step=100,
@@ -388,7 +388,7 @@ def sum_tokens(client):
388388
tpm = gr.Slider(
389389
label="TPM",
390390
minimum=5000,
391-
maximum=100000,
391+
maximum=5000000,
392392
value=50000,
393393
step=1000,
394394
interactive=True,
@@ -435,9 +435,11 @@ def sum_tokens(client):
435435
test_api_connection,
436436
inputs=[base_url, api_key, synthesizer_model],
437437
outputs=[])
438-
test_connection_btn.click(test_api_connection,
439-
inputs=[base_url, api_key, trainee_model],
440-
outputs=[])
438+
439+
if if_trainee_model.value:
440+
test_connection_btn.click(test_api_connection,
441+
inputs=[base_url, api_key, trainee_model],
442+
outputs=[])
441443

442444
expand_method.change(lambda method:
443445
(gr.update(visible=method == "max_width"),

webui/translation.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
},
1717
"zh": {
1818
"Title": "✨开箱即用的LLM训练数据生成框架✨",
19-
"Intro": "是一个基于知识图谱的合成数据生成框架,旨在解决知识密集型问答生成的挑战\n\n 上传你的文本块(如农业、医疗、海洋知识),填写 LLM api key,即可在线生成 **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)**、**[xtuner](https://github.com/InternLM/xtuner)** 所需训练数据。结束后我们将自动删除用户信息。",
19+
"Intro": "是一个基于知识图谱的数据合成框架,旨在知识密集型任务中生成问答\n\n 上传你的文本块(如农业、医疗、海洋知识),填写 LLM api key,即可在线生成 **[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)**、**[xtuner](https://github.com/InternLM/xtuner)** 所需训练数据。结束后我们将自动删除用户信息。",
2020
"Use Trainee Model": "使用Trainee Model来识别知识盲区,使用硅基流动时请保持禁用",
2121
"Base URL Info": "调用模型API的URL,默认使用硅基流动",
2222
"Synthesizer Model Info": "用于构建知识图谱和生成问答的模型",

0 commit comments

Comments (0)