-import asyncio
 import os
 import time
-from typing import Dict, cast
+from typing import Dict
 
 import gradio as gr
 
 from graphgen.bases import BaseLLMWrapper
-from graphgen.bases.base_storage import StorageNameSpace
 from graphgen.bases.datatypes import Chunk
 from graphgen.engine import op
 from graphgen.models import (
     JsonKVStorage,
     JsonListStorage,
+    MetaJsonKVStorage,
     NetworkXStorage,
     OpenAIClient,
     Tokenizer,
@@ -56,6 +55,10 @@ def __init__(
         )
         self.trainee_llm_client: BaseLLMWrapper = trainee_llm_client
 
+        self.meta_storage: MetaJsonKVStorage = MetaJsonKVStorage(
+            self.working_dir, namespace="_meta"
+        )
+
         self.full_docs_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="full_docs"
         )
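
Note on the new meta storage: `MetaJsonKVStorage` (namespace `_meta`) records which chunk keys have already been turned into KG entities, so `build_kg` below can pick up only unprocessed chunks via `get_new_data()` and `mark_done()`. A minimal self-contained sketch of that contract, assuming the chunk storage exposes its dict as `.data` (an assumption; the real class lives in `graphgen.models` and may differ):

```python
import json
import os


class ToyMetaStore:
    """Illustrative stand-in for MetaJsonKVStorage: remembers which keys
    of another KV storage have already been processed."""

    def __init__(self, working_dir: str, namespace: str = "_meta"):
        self._path = os.path.join(working_dir, f"{namespace}.json")
        self._done = set()
        if os.path.exists(self._path):
            with open(self._path, encoding="utf-8") as f:
                self._done = set(json.load(f))

    async def get_new_data(self, storage) -> dict:
        # Assumes `storage.data` is the underlying dict of the KV store.
        return {k: v for k, v in storage.data.items() if k not in self._done}

    async def mark_done(self, storage) -> None:
        # Record every current key of `storage` as processed.
        self._done |= set(storage.data.keys())

    async def index_done_callback(self) -> None:
        # Persist the processed-key set, mirroring the storage callbacks.
        with open(self._path, "w", encoding="utf-8") as f:
            json.dump(sorted(self._done), f)
```
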
@@ -82,14 +85,13 @@ def __init__(
         # webui
         self.progress_bar: gr.Progress = progress_bar
 
-    @op("insert", deps=[])
+    @op("read", deps=[])
     @async_to_sync_method
-    async def insert(self, insert_config: Dict):
+    async def read(self, read_config: Dict):
         """
-        insert chunks into the graph
+        read files from input sources
         """
-        # Step 1: Read files
-        data = read_files(insert_config["input_file"], self.working_dir)
+        data = read_files(read_config["input_file"], self.working_dir)
         if len(data) == 0:
             logger.warning("No data to process")
             return
@@ -108,8 +110,8 @@ async def insert(self, insert_config: Dict):
 
         inserting_chunks = await chunk_documents(
             new_docs,
-            insert_config["chunk_size"],
-            insert_config["chunk_overlap"],
+            read_config["chunk_size"],
+            read_config["chunk_overlap"],
             self.tokenizer_instance,
             self.progress_bar,
         )
@@ -125,9 +127,25 @@ async def insert(self, insert_config: Dict):
             logger.warning("All chunks are already in the storage")
             return
 
-        logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks))
+        await self.full_docs_storage.upsert(new_docs)
+        await self.full_docs_storage.index_done_callback()
         await self.chunks_storage.upsert(inserting_chunks)
+        await self.chunks_storage.index_done_callback()
+
+    @op("build_kg", deps=["read"])
+    @async_to_sync_method
+    async def build_kg(self):
+        """
+        build knowledge graph from text chunks
+        """
+        # Step 1: get new chunks according to meta and chunks storage
+        inserting_chunks = await self.meta_storage.get_new_data(self.chunks_storage)
+        if len(inserting_chunks) == 0:
+            logger.warning("All chunks are already in the storage")
+            return
 
+        logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks))
+        # Step 2: build knowledge graph from new chunks
         _add_entities_and_relations = await build_kg(
             llm_client=self.synthesizer_llm_client,
             kg_instance=self.graph_storage,
@@ -138,23 +156,13 @@ async def insert(self, insert_config: Dict):
             logger.warning("No entities or relations extracted from text chunks")
             return
 
-        await self._insert_done()
+        # Step 3: mark meta
+        await self.meta_storage.mark_done(self.chunks_storage)
+        await self.meta_storage.index_done_callback()
+
         return _add_entities_and_relations
 
-    async def _insert_done(self):
-        tasks = []
-        for storage_instance in [
-            self.full_docs_storage,
-            self.chunks_storage,
-            self.graph_storage,
-            self.search_storage,
-        ]:
-            if storage_instance is None:
-                continue
-            tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback())
-        await asyncio.gather(*tasks)
-
-    @op("search", deps=["insert"])
+    @op("search", deps=["read"])
     @async_to_sync_method
     async def search(self, search_config: Dict):
         logger.info(
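
With the split, the old single `insert(...)` call becomes two steps. A hedged usage sketch: the config keys (`input_file`, `chunk_size`, `chunk_overlap`) come from this diff, while the `GraphGen` class name and its constructor arguments are assumptions; `@async_to_sync_method` is what allows the synchronous calls:

```python
# Hypothetical driver; class name and constructor arguments are assumed.
graph_gen = GraphGen(working_dir="cache")

# Step 1: read + chunk the input; chunks land in chunks_storage.
graph_gen.read(
    {"input_file": "data/corpus.jsonl", "chunk_size": 1024, "chunk_overlap": 100}
)

# Step 2: extract entities/relations from chunks not yet marked in _meta.
# Re-running is cheap: already-processed chunks are skipped.
graph_gen.build_kg()
```
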
@@ -188,9 +196,9 @@ async def search(self, search_config: Dict):
             ]
         )
         # TODO: fix insert after search
-        await self.insert()
+        # await self.insert()
 
193- @op ("quiz_and_judge" , deps = ["insert " ])
201+ @op ("quiz_and_judge" , deps = ["build_kg " ])
194202 @async_to_sync_method
195203 async def quiz_and_judge (self , quiz_and_judge_config : Dict ):
196204 logger .warning (
@@ -229,7 +237,7 @@ async def quiz_and_judge(self, quiz_and_judge_config: Dict):
         logger.info("Restarting synthesizer LLM client.")
         self.synthesizer_llm_client.restart()
 
-    @op("partition", deps=["insert"])
+    @op("partition", deps=["build_kg"])
     @async_to_sync_method
     async def partition(self, partition_config: Dict):
235243 batches = await partition_kg (
@@ -257,7 +265,7 @@ async def extract(self, extract_config: Dict):
257265 return
258266 print (results )
259267
260- @op ("generate" , deps = ["insert" , " partition" ])
268+ @op ("generate" , deps = ["partition" ])
261269 @async_to_sync_method
262270 async def generate (self , generate_config : Dict ):
263271
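
For reference, the `@op` dependency edges whose declarations are visible in this diff form the DAG below after the change. A sketch of how such metadata could be resolved into a run order with the standard-library `graphlib` (illustrative only; `graphgen.engine`'s actual scheduler may differ):

```python
from graphlib import TopologicalSorter

# deps as declared in this file after the change: {op: predecessors}
ops = {
    "read": [],
    "search": ["read"],
    "build_kg": ["read"],
    "quiz_and_judge": ["build_kg"],
    "partition": ["build_kg"],
    "generate": ["partition"],
}

print(list(TopologicalSorter(ops).static_order()))
# One valid order: ['read', 'search', 'build_kg', 'quiz_and_judge',
#                   'partition', 'generate']
```
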
@@ -295,6 +303,3 @@ async def clear(self):
 
 # TODO: add data filtering step here in the future
 # graph_gen.filter(filter_config=config["filter"])
-
-
-# TODO: split insert into two ops: read + build_kg, which makes more sense