Commit 256acc1

feat: add mo_kg_builder
1 parent 96be73a commit 256acc1

13 files changed: +347 -179 lines changed


graphgen/configs/protein_qa_config.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/protein_qa_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+  input_file: resources/input_examples/protein_qa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
   anchor_type: protein # get protein information from chunks
 split:
   chunk_size: 1024 # chunk size for text splitting
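
The demo input moves from plain text to JSON; the `read` block itself is ordinary YAML, so its keys can be inspected directly. A minimal loading sketch (PyYAML assumed; this loader is illustrative, not GraphGen's actual config reader):

```python
# Illustrative only: load the config and read the keys shown in the diff above.
import yaml  # assumes PyYAML is installed

with open("graphgen/configs/protein_qa_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

input_file = config["read"]["input_file"]        # resources/input_examples/protein_qa_demo.json
anchor_type = config["read"].get("anchor_type")  # "protein"
chunk_size = config["split"]["chunk_size"]       # 1024
print(input_file, anchor_type, chunk_size)
```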

graphgen/models/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
     MultiHopGenerator,
     VQAGenerator,
 )
-from .kg_builder import LightRAGKGBuilder, MMKGBuilder
+from .kg_builder import LightRAGKGBuilder, MMKGBuilder, MOKGBuilder
 from .llm import HTTPClient, OllamaClient, OpenAIClient
 from .partitioner import (
     AnchorBFSPartitioner,
graphgen/models/kg_builder/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -1,2 +1,3 @@
 from .light_rag_kg_builder import LightRAGKGBuilder
 from .mm_kg_builder import MMKGBuilder
+from .mo_kg_builder import MOKGBuilder
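
Together with the `graphgen/models/__init__.py` change above, this re-export makes the new builder importable from the package root:

```python
from graphgen.models import MOKGBuilder  # re-exported at the package root
# equivalently: from graphgen.models.kg_builder import MOKGBuilder
```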

graphgen/models/kg_builder/mo_kg_builder.py

Lines changed: 79 additions & 2 deletions
@@ -1,11 +1,36 @@
+import re
+from collections import defaultdict
 from typing import Dict, List, Tuple
 
 from graphgen.bases import Chunk
+from graphgen.templates import PROTEIN_KG_EXTRACTION_PROMPT
+from graphgen.utils import (
+    detect_main_language,
+    handle_single_entity_extraction,
+    handle_single_relationship_extraction,
+    logger,
+    split_string_by_multi_markers,
+)
 
 from .light_rag_kg_builder import LightRAGKGBuilder
 
 
 class MOKGBuilder(LightRAGKGBuilder):
+    @staticmethod
+    async def scan_document_for_schema(
+        chunk: Chunk, schema: Dict[str, List[str]]
+    ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]:
+        """
+        Scan the document chunk to extract entities and relationships based on the provided schema.
+        :param chunk: The document chunk to be scanned.
+        :param schema: A dictionary defining the entities and relationships to be extracted.
+        :return: A tuple containing two dictionaries - one for entities and one for relationships.
+        """
+        # TODO: use hard-coded PROTEIN_KG_EXTRACTION_PROMPT for protein chunks,
+        # support schema for other chunk types later
+        print(chunk.id, schema)
+        return {}, {}
+
     async def extract(
         self, chunk: Chunk
     ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]:
@@ -19,5 +44,57 @@ async def extract(
         :return: Tuple containing entities and relationships.
         """
         # TODO: Implement the multi-omics KG extraction logic here
-        print(chunk)
-        return {}, {}
+        chunk_id = chunk.id
+        chunk_type = chunk.type  # genome | protein | ...
+        metadata = chunk.metadata
+
+        # choose different extraction strategies based on chunk type
+        if chunk_type == "protein":
+            protein_caption = ""
+            for key, value in metadata["protein_caption"].items():
+                protein_caption += f"{key}: {value}\n"
+            logger.debug("Protein chunk caption: %s", protein_caption)
+
+            language = detect_main_language(protein_caption)
+            prompt_template = PROTEIN_KG_EXTRACTION_PROMPT[language].format(
+                **PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"],
+                input_text=protein_caption,
+            )
+            result = await self.llm_client.generate_answer(prompt_template)
+            logger.debug("Protein chunk extraction result: %s", result)
+
+            # parse the result
+            records = split_string_by_multi_markers(
+                result,
+                [
+                    PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"],
+                    PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"],
+                ],
+            )
+
+            nodes = defaultdict(list)
+            edges = defaultdict(list)
+
+            for record in records:
+                match = re.search(r"\((.*)\)", record)
+                if not match:
+                    continue
+                inner = match.group(1)
+
+                attributes = split_string_by_multi_markers(
+                    inner, [PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]]
+                )
+
+                entity = await handle_single_entity_extraction(attributes, chunk_id)
+                if entity is not None:
+                    nodes[entity["entity_name"]].append(entity)
+                    continue
+
+                relation = await handle_single_relationship_extraction(
+                    attributes, chunk_id
+                )
+                if relation is not None:
+                    key = (relation["src_id"], relation["tgt_id"])
+                    edges[key].append(relation)
+
+        return dict(nodes), dict(edges)
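
`extract` expects the LLM to emit parenthesized, delimiter-separated records. A standalone sketch of that parsing step follows; the delimiter strings are assumptions (LightRAG-style), since the real values live in `PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]`, and `split_by_markers` is a stand-in for `graphgen.utils.split_string_by_multi_markers`:

```python
import re

RECORD_DELIMITER = "##"                # assumed; real value in PROTEIN_KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"]
COMPLETION_DELIMITER = "<|COMPLETE|>"  # assumed; ..."completion_delimiter"
TUPLE_DELIMITER = "<|>"                # assumed; ..."tuple_delimiter"

def split_by_markers(text: str, markers: list[str]) -> list[str]:
    """Stand-in for graphgen.utils.split_string_by_multi_markers."""
    parts = re.split("|".join(map(re.escape, markers)), text)
    return [p.strip() for p in parts if p.strip()]

# hypothetical LLM output for one protein chunk
llm_output = (
    '("entity"<|>TP53<|>protein<|>tumor suppressor transcription factor)##'
    '("relationship"<|>TP53<|>MDM2<|>is negatively regulated by<|>0.9)##'
    "<|COMPLETE|>"
)

for record in split_by_markers(llm_output, [RECORD_DELIMITER, COMPLETION_DELIMITER]):
    match = re.search(r"\((.*)\)", record)  # same regex as extract() above
    if not match:
        continue
    attributes = split_by_markers(match.group(1), [TUPLE_DELIMITER])
    print(attributes)  # e.g. ['"entity"', 'TP53', 'protein', 'tumor suppressor transcription factor']
```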
Lines changed: 102 additions & 46 deletions
@@ -1,61 +1,117 @@
-import requests
-from fastapi import HTTPException
+from io import StringIO
+from typing import Dict, Optional
 
-from graphgen.utils import logger
+from Bio import ExPASy, SeqIO, SwissProt, UniProt
+from Bio.Blast import NCBIWWW, NCBIXML
 
-UNIPROT_BASE = "https://rest.uniprot.org/uniprotkb/search"
+from graphgen.utils import logger
 
 
 class UniProtSearch:
     """
     UniProt Search client to search with UniProt.
     1) Get the protein by accession number.
-    2) Search with keywords or protein names.
+    2) Search with keywords or protein names (fuzzy search).
     """
 
-    def get_entry(self, accession: str) -> dict:
+    def get_by_accession(self, accession: str) -> Optional[dict]:
+        try:
+            handle = ExPASy.get_sprot_raw(accession)
+            record = SwissProt.read(handle)
+            handle.close()
+            return self._swissprot_to_dict(record)
+        except Exception as exc:  # pylint: disable=broad-except
+            logger.error("Accession %s not found: %s", accession, exc)
+            return None
+
+    @staticmethod
+    def _swissprot_to_dict(record: SwissProt.Record) -> dict:
+        """
+        Convert a SwissProt.Record to a dictionary.
         """
-        Get the UniProt entry by accession number(e.g., P04637).
+        functions = []
+        for line in record.comments:
+            if line.startswith("FUNCTION:"):
+                functions.append(line[9:].strip())
+
+        return {
+            "molecule_type": "protein",
+            "database": "UniProt",
+            "id": record.accessions[0],
+            "entry_name": record.entry_name,
+            "gene_names": record.gene_name,
+            "protein_name": record.description.split(";")[0].split("=")[-1],
+            "organism": record.organism.split(" (")[0],
+            "sequence": str(record.sequence),
+            "function": functions,
+            "url": f"https://www.uniprot.org/uniprot/{record.accessions[0]}",
+        }
+
+    def get_best_hit(self, keyword: str) -> Optional[Dict]:
         """
-        url = f"{UNIPROT_BASE}/{accession}.json"
-        return self._safe_get(url).json()
-
-    def search(
-        self,
-        query: str,
-        *,
-        size: int = 10,
-        cursor: str = None,
-        fields: list[str] = None,
-    ) -> dict:
+        Search UniProt with a keyword and return the best hit.
+        :param keyword: The search keyword.
+        :return: A dictionary containing the best hit information or None if not found.
         """
-        Search UniProt with a query string.
-        :param query: The search query.
-        :param size: The number of results to return.
-        :param cursor: The cursor for pagination.
-        :param fields: The fields to return in the response.
-        :return: A dictionary containing the search results.
+        if not keyword.strip():
+            return None
+
+        try:
+            iterator = UniProt.search(keyword, fields=None, batch_size=1)
+            hit = next(iterator, None)
+            if hit is None:
+                return None
+            return self.get_by_accession(hit["primaryAccession"])
+
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error("Keyword %s not found: %s", keyword, e)
+            return None
+
+    def get_by_fasta(self, fasta_sequence: str, threshold: float) -> Optional[Dict]:
         """
-        params = {
-            "query": query,
-            "size": size,
-        }
-        if cursor:
-            params["cursor"] = cursor
-        if fields:
-            params["fields"] = ",".join(fields)
-        url = UNIPROT_BASE
-        return self._safe_get(url, params=params).json()
+        Search UniProt with a FASTA sequence and return the best hit.
+        :param fasta_sequence: The FASTA sequence.
+        :param threshold: E-value threshold for BLAST search.
+        :return: A dictionary containing the best hit information or None if not found.
+        """
+        try:
+            if fasta_sequence.startswith(">"):
+                seq = str(list(SeqIO.parse(StringIO(fasta_sequence), "fasta"))[0].seq)
+            else:
+                seq = fasta_sequence.strip()
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error("Invalid FASTA sequence: %s", e)
+            return None
 
-    @staticmethod
-    def _safe_get(url: str, params: dict = None) -> requests.Response:
-        r = requests.get(
-            url,
-            params=params,
-            headers={"Accept": "application/json"},
-            timeout=10,
-        )
-        if not r.ok:
-            logger.error("Search engine error: %s", r.text)
-            raise HTTPException(r.status_code, "Search engine error.")
-        return r
+        if not seq:
+            logger.error("Empty FASTA sequence provided.")
+            return None
+
+        # UniProtKB/Swiss-Prot BLAST API
+        try:
+            result_handle = NCBIWWW.qblast(
+                program="blastp",
+                database="swissprot",
+                sequence=seq,
+                hitlist_size=1,
+                expect=threshold,
+            )
+            blast_record = NCBIXML.read(result_handle)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error("BLAST search failed: %s", e)
+            return None
+
+        if not blast_record.alignments:
+            logger.info("No BLAST hits found for the given sequence.")
+            return None
+
+        best_alignment = blast_record.alignments[0]
+        best_hsp = best_alignment.hsps[0]
+        if best_hsp.expect > threshold:
+            logger.info("No BLAST hits below the threshold E-value.")
+            return None
+        hit_id = best_alignment.hit_id
+
+        # like sp|P01308.1|INS_HUMAN
+        accession = hit_id.split("|")[1].split(".")[0] if "|" in hit_id else hit_id
+        return self.get_by_accession(accession)
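
A possible usage sketch of the rewritten client (Biopython required; all three lookups hit live ExPASy / NCBI services, so expect latency and rate limits — the accession, keyword, and sequence values below are just examples):

```python
# Illustrative usage; network access and the biopython package are assumed.
client = UniProtSearch()

entry = client.get_by_accession("P04637")  # human p53
if entry:
    print(entry["protein_name"], entry["organism"], entry["url"])

best = client.get_best_hit("insulin")      # fuzzy keyword search, best hit only

# BLAST the sequence against SwissProt; returns None if no hit beats the E-value cutoff
hit = client.get_by_fasta(">query\nMALWMRLLPLLALLALWGPDPAAA", threshold=1e-5)
```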

graphgen/operators/build_kg/build_kg.py

Lines changed: 17 additions & 12 deletions
@@ -30,7 +30,12 @@ async def build_kg(
     """
 
     text_chunks = [chunk for chunk in chunks if chunk.type == "text"]
-    mm_chunks = [chunk for chunk in chunks if chunk.type != "text"]
+    mm_chunks = [
+        chunk
+        for chunk in chunks
+        if chunk.type in ("image", "video", "table", "formula")
+    ]
+    mo_chunks = [chunk for chunk in chunks if chunk.type in ("genome", "protein")]
 
     if len(text_chunks) == 0:
         logger.info("All text chunks are already in the storage")
@@ -42,6 +47,7 @@ async def build_kg(
             chunks=text_chunks,
             progress_bar=progress_bar,
         )
+
     if len(mm_chunks) == 0:
         logger.info("All multi-modal chunks are already in the storage")
     else:
@@ -53,16 +59,15 @@ async def build_kg(
             progress_bar=progress_bar,
         )
 
-    if anchor_type is not None:
-        logger.info("Anchoring data based on %s ...", anchor_type)
-        if anchor_type == "protein":
-            await build_mo_kg(
-                llm_client=llm_client,
-                kg_instance=kg_instance,
-                chunks=text_chunks,
-                progress_bar=progress_bar,
-            )
-        else:
-            logger.error("Anchor type %s is not supported yet.", anchor_type)
+    if len(mo_chunks) == 0:
+        logger.info("All multi-omics chunks are already in the storage")
+    else:
+        logger.info("[Multi-omics Entity and Relation Extraction] processing ...")
+        await build_mo_kg(
+            llm_client=llm_client,
+            kg_instance=kg_instance,
+            chunks=mo_chunks,
+            progress_bar=progress_bar,
+        )
 
     return kg_instance
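
This routing change is the core of the commit: chunks now select their pipeline by `chunk.type` rather than by a global `anchor_type` flag, so omics chunks no longer ride through the text path. A self-contained sketch of the partitioning (the `Chunk` dataclass below is a stand-in for `graphgen.bases.Chunk`):

```python
from dataclasses import dataclass, field

@dataclass
class Chunk:  # stand-in for graphgen.bases.Chunk
    id: str
    type: str
    metadata: dict = field(default_factory=dict)

chunks = [
    Chunk("c1", "text"),
    Chunk("c2", "image"),
    Chunk("c3", "protein", {"protein_caption": {"name": "TP53"}}),
]

# mirrors the three comprehensions in build_kg above
text_chunks = [c for c in chunks if c.type == "text"]
mm_chunks = [c for c in chunks if c.type in ("image", "video", "table", "formula")]
mo_chunks = [c for c in chunks if c.type in ("genome", "protein")]

assert [c.id for c in mo_chunks] == ["c3"]  # only omics chunks reach build_mo_kg
```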
