Commit caba989

Merge pull request #84 from open-sciencelab/feature/schema_guided_build
[Feature]: schema guided extraction
2 parents 519dfef + 2b6934c

27 files changed: +420 −23 lines changed

graphgen/bases/__init__.py
Lines changed: 1 addition & 0 deletions

@@ -1,3 +1,4 @@
+from .base_extractor import BaseExtractor
 from .base_generator import BaseGenerator
 from .base_kg_builder import BaseKGBuilder
 from .base_llm_wrapper import BaseLLMWrapper

graphgen/bases/base_extractor.py
Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
+
+
+class BaseExtractor(ABC):
+    """
+    Extract information from given text.
+
+    """
+
+    def __init__(self, llm_client: BaseLLMWrapper):
+        self.llm_client = llm_client
+
+    @abstractmethod
+    async def extract(self, chunk: dict) -> Any:
+        """Extract information from the given text"""
+
+    @abstractmethod
+    def build_prompt(self, text: str) -> str:
+        """Build prompt for LLM based on the given text"""
graphgen/bases/base_storage.py
Lines changed: 3 additions & 0 deletions

@@ -45,6 +45,9 @@ async def get_by_ids(
     ) -> list[Union[T, None]]:
         raise NotImplementedError

+    async def get_all(self) -> dict[str, T]:
+        raise NotImplementedError
+
     async def filter_keys(self, data: list[str]) -> set[str]:
         """return un-exist keys"""
         raise NotImplementedError
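For a dict-backed store, the new get_all hook is a one-liner. A minimal sketch, assuming the storage keeps its records in an internal _data dict (JsonKVStorage's real internals are not shown in this commit):

    async def get_all(self) -> dict[str, T]:
        # Return a shallow copy so callers cannot mutate the store in place.
        return dict(self._data)  # _data is an assumed internal attribute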

graphgen/configs/aggregated_config.yaml
Lines changed: 5 additions & 2 deletions

@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting

   - name: build_kg

graphgen/configs/atomic_config.yaml
Lines changed: 3 additions & 0 deletions

@@ -2,6 +2,9 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
       chunk_size: 1024 # chunk size for text splitting
       chunk_overlap: 100 # chunk overlap for text splitting

graphgen/configs/cot_config.yaml
Lines changed: 5 additions & 2 deletions

@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting

   - name: build_kg

graphgen/configs/multi_hop_config.yaml
Lines changed: 3 additions & 0 deletions

@@ -2,6 +2,9 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
       chunk_size: 1024 # chunk size for text splitting
       chunk_overlap: 100 # chunk overlap for text splitting

(new config file; name not captured in this view)
Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+pipeline:
+  - name: read
+    params:
+      input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
+      chunk_size: 20480
+      chunk_overlap: 2000
+      separators: []
+
+  - name: extract
+    params:
+      method: schema_guided # extraction method, support: schema_guided
+      schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
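The referenced legal_contract.json is not included in this commit view. A purely illustrative shape for such a schema file, under the assumption that it maps target fields to their expected types, might be:

{
  "contract_title": "string",
  "parties": ["string"],
  "effective_date": "string",
  "governing_law": "string",
  "termination_clause": "string"
}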

graphgen/configs/vqa_config.yaml
Lines changed: 5 additions & 2 deletions

@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting

   - name: build_kg

graphgen/graphgen.py
Lines changed: 46 additions & 6 deletions

@@ -18,6 +18,7 @@
 from graphgen.operators import (
     build_kg,
     chunk_documents,
+    extract_info,
     generate_qas,
     init_llm,
     judge_statement,

@@ -70,6 +71,7 @@ def __init__(
         self.search_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="search"
         )
+
         self.rephrase_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="rephrase"
         )

@@ -80,6 +82,10 @@ def __init__(
             os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
             namespace="qa",
         )
+        self.extract_storage: JsonKVStorage = JsonKVStorage(
+            os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
+            namespace="extraction",
+        )

         # webui
         self.progress_bar: gr.Progress = progress_bar

@@ -103,16 +109,30 @@ async def read(self, read_config: Dict):
         _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
         new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}

+        if len(new_docs) == 0:
+            logger.warning("All documents are already in the storage")
+            return
+
+        await self.full_docs_storage.upsert(new_docs)
+        await self.full_docs_storage.index_done_callback()
+
+    @op("chunk", deps=["read"])
+    @async_to_sync_method
+    async def chunk(self, chunk_config: Dict):
+        """
+        chunk documents into smaller pieces from full_docs_storage if not already present
+        """
+
+        new_docs = await self.meta_storage.get_new_data(self.full_docs_storage)
         if len(new_docs) == 0:
             logger.warning("All documents are already in the storage")
             return

         inserting_chunks = await chunk_documents(
             new_docs,
-            read_config["chunk_size"],
-            read_config["chunk_overlap"],
             self.tokenizer_instance,
             self.progress_bar,
+            **chunk_config,
         )

         _add_chunk_keys = await self.chunks_storage.filter_keys(

@@ -126,12 +146,12 @@ async def read(self, read_config: Dict):
             logger.warning("All chunks are already in the storage")
             return

-        await self.full_docs_storage.upsert(new_docs)
-        await self.full_docs_storage.index_done_callback()
         await self.chunks_storage.upsert(inserting_chunks)
         await self.chunks_storage.index_done_callback()
+        await self.meta_storage.mark_done(self.full_docs_storage)
+        await self.meta_storage.index_done_callback()

-    @op("build_kg", deps=["read"])
+    @op("build_kg", deps=["chunk"])
     @async_to_sync_method
     async def build_kg(self):
         """

@@ -161,7 +181,7 @@ async def build_kg(self):

         return _add_entities_and_relations

-    @op("search", deps=["read"])
+    @op("search", deps=["chunk"])
     @async_to_sync_method
     async def search(self, search_config: Dict):
         logger.info(

@@ -248,6 +268,26 @@ async def partition(self, partition_config: Dict):
         await self.partition_storage.upsert(batches)
         return batches

+    @op("extract", deps=["chunk"])
+    @async_to_sync_method
+    async def extract(self, extract_config: Dict):
+        logger.info("Extracting information from given chunks...")
+
+        results = await extract_info(
+            self.synthesizer_llm_client,
+            self.chunks_storage,
+            extract_config,
+            progress_bar=self.progress_bar,
+        )
+        if not results:
+            logger.warning("No information extracted")
+            return
+
+        await self.extract_storage.upsert(results)
+        await self.extract_storage.index_done_callback()
+        await self.meta_storage.mark_done(self.chunks_storage)
+        await self.meta_storage.index_done_callback()
+
     @op("generate", deps=["partition"])
     @async_to_sync_method
     async def generate(self, generate_config: Dict):
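Taken together, the op graph now runs read → chunk → {build_kg, search, extract}, with partition and generate downstream. A hedged sketch of driving the new extraction path, assuming the @async_to_sync_method-decorated ops are callable synchronously and a GraphGen constructor along these lines (neither is confirmed by this diff):

from graphgen.graphgen import GraphGen

gg = GraphGen(working_dir="cache")  # constructor arguments are an assumption

gg.read({"input_file": "resources/input_examples/extract_demo.txt"})
gg.chunk({"chunk_size": 20480, "chunk_overlap": 2000, "separators": []})
gg.extract({
    "method": "schema_guided",
    "schema_file": "graphgen/templates/extraction/schemas/legal_contract.json",
})
# Extracted records land in the "extraction" KV namespace created in __init__.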
