diff --git a/graphgen/bases/base_reader.py b/graphgen/bases/base_reader.py
index 118b5258..89778469 100644
--- a/graphgen/bases/base_reader.py
+++ b/graphgen/bases/base_reader.py
@@ -1,6 +1,9 @@
+import os
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List
 
+import requests
+
 
 class BaseReader(ABC):
     """
@@ -18,3 +21,45 @@ def read(self, file_path: str) -> List[Dict[str, Any]]:
         :param file_path: Path to the input file.
         :return: List of dictionaries containing the data.
         """
+
+    @staticmethod
+    def filter(data: List[dict]) -> List[dict]:
+        """
+        Filter out text entries with empty content, and multi-modal entries
+        (image, table, equation) whose referenced image file does not exist.
+
+        :param data: List of dictionaries containing the data.
+        :return: Filtered list of dictionaries.
+        """
+
+        def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
+            """
+            Check if an image exists at the given local path or URL.
+            :param path_or_url: Local file path or remote URL of the image.
+            :param timeout: Timeout for remote URL requests in seconds.
+            :return: True if the image exists, False otherwise.
+            """
+            if not path_or_url:
+                return False
+            if not path_or_url.startswith(("http://", "https://", "ftp://")):
+                path = path_or_url.replace("file://", "", 1)
+                path = os.path.abspath(path)
+                return os.path.isfile(path)
+            try:
+                resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
+                return resp.status_code == 200
+            except requests.RequestException:
+                return False
+
+        filtered_data = []
+        for item in data:
+            if item.get("type") == "text":
+                content = item.get("content", "").strip()
+                if content:
+                    filtered_data.append(item)
+            elif item.get("type") in ("image", "table", "equation"):
+                img_path = item.get("img_path")
+                if _image_exists(img_path):
+                    filtered_data.append(item)
+            else:
+                filtered_data.append(item)
+        return filtered_data
diff --git a/graphgen/bases/datatypes.py b/graphgen/bases/datatypes.py
index beb73a77..58dbda2e 100644
--- a/graphgen/bases/datatypes.py
+++ b/graphgen/bases/datatypes.py
@@ -7,8 +7,18 @@ class Chunk:
     id: str
     content: str
+    type: str
     metadata: dict = field(default_factory=dict)
 
+    @staticmethod
+    def from_dict(key: str, data: dict) -> "Chunk":
+        return Chunk(
+            id=key,
+            content=data.get("content", ""),
+            type=data.get("type", "unknown"),
+            metadata={k: v for k, v in data.items() if k != "content"},
+        )
+
 
 @dataclass
 class QAPair:
diff --git a/graphgen/configs/vqa_config.yaml b/graphgen/configs/vqa_config.yaml
index 06ae04ff..37ed0e1f 100644
--- a/graphgen/configs/vqa_config.yaml
+++ b/graphgen/configs/vqa_config.yaml
@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/pdf_demo.pdf # input file path, supports json, jsonl, txt, pdf. See resources/input_examples for examples
+  input_file: resources/input_examples/vqa_demo.json # input file path, supports json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -7,16 +7,12 @@ search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
 quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: true
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
+  enabled: false
 partition: # graph partition configuration
-  method: ece # ece is a custom partition method based on comprehension loss
+  method: anchor_bfs # partition method
   method_params:
-    max_units_per_community: 20 # max nodes and edges per community
-    min_units_per_community: 5 # min nodes and edges per community
-    max_tokens_per_community: 10240 # max tokens per community
-    unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
+    anchor_type: image # node type to select anchor nodes
+    max_units_per_community: 10 # max nodes and edges per community (the anchor plus its BFS neighborhood)
 generate:
   mode: vqa # atomic, aggregated, multi_hop, cot, vqa
   data_format: ChatML # Alpaca, Sharegpt, ChatML
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
index 3a95a274..8b0559d6 100644
--- a/graphgen/graphgen.py
+++ b/graphgen/graphgen.py
@@ -16,7 +16,8 @@
     Tokenizer,
 )
 from graphgen.operators import (
-    build_kg,
+    build_mm_kg,
+    build_text_kg,
     chunk_documents,
     generate_qas,
     judge_statement,
@@ -25,7 +26,7 @@
     read_files,
     search_all,
 )
-from graphgen.utils import async_to_sync_method, compute_content_hash, logger
+from graphgen.utils import async_to_sync_method, compute_mm_hash, logger
 
 sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 
@@ -68,8 +69,8 @@ def __post_init__(self):
         self.full_docs_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="full_docs"
         )
-        self.text_chunks_storage: JsonKVStorage = JsonKVStorage(
-            self.working_dir, namespace="text_chunks"
+        self.chunks_storage: JsonKVStorage = JsonKVStorage(
+            self.working_dir, namespace="chunks"
         )
         self.graph_storage: NetworkXStorage = NetworkXStorage(
             self.working_dir, namespace="graph"
@@ -96,70 +97,122 @@ async def insert(self, read_config: Dict, split_config: Dict):
             logger.warning("No data to process")
             return
 
+        assert isinstance(data, list) and isinstance(data[0], dict)
+
         # TODO: configurable whether to use coreference resolution
 
-        # Step 2: Split chunks and filter existing ones
-        assert isinstance(data, list) and isinstance(data[0], dict)
-        new_docs = {
-            compute_content_hash(doc["content"], prefix="doc-"): {
-                "content": doc["content"]
-            }
-            for doc in data
-            if doc.get("type", "text") == "text"
-        }
+        new_docs = {compute_mm_hash(doc, prefix="doc-"): doc for doc in data}
         _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
         new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+        new_text_docs = {k: v for k, v in new_docs.items() if v.get("type") == "text"}
+        new_mm_docs = {k: v for k, v in new_docs.items() if v.get("type") != "text"}
 
-        if len(new_docs) == 0:
-            logger.warning("All docs are already in the storage")
-            return
-        logger.info("[New Docs] inserting %d docs", len(new_docs))
+        await self.full_docs_storage.upsert(new_docs)
 
-        inserting_chunks = await chunk_documents(
-            new_docs,
-            split_config["chunk_size"],
-            split_config["chunk_overlap"],
-            self.tokenizer_instance,
-            self.progress_bar,
-        )
+
async def _insert_text_docs(text_docs): + if len(text_docs) == 0: + logger.warning("All text docs are already in the storage") + return + logger.info("[New Docs] inserting %d text docs", len(text_docs)) + # Step 2.1: Split chunks and filter existing ones + inserting_chunks = await chunk_documents( + text_docs, + split_config["chunk_size"], + split_config["chunk_overlap"], + self.tokenizer_instance, + self.progress_bar, + ) - _add_chunk_keys = await self.text_chunks_storage.filter_keys( - list(inserting_chunks.keys()) - ) - inserting_chunks = { - k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys - } + _add_chunk_keys = await self.chunks_storage.filter_keys( + list(inserting_chunks.keys()) + ) + inserting_chunks = { + k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys + } - if len(inserting_chunks) == 0: - logger.warning("All chunks are already in the storage") - return + if len(inserting_chunks) == 0: + logger.warning("All text chunks are already in the storage") + return + + logger.info("[New Chunks] inserting %d text chunks", len(inserting_chunks)) + await self.chunks_storage.upsert(inserting_chunks) + + # Step 2.2: Extract entities and relations from text chunks + logger.info("[Text Entity and Relation Extraction] processing ...") + _add_entities_and_relations = await build_text_kg( + llm_client=self.synthesizer_llm_client, + kg_instance=self.graph_storage, + chunks=[ + Chunk(id=k, content=v["content"], type="text") + for k, v in inserting_chunks.items() + ], + progress_bar=self.progress_bar, + ) + if not _add_entities_and_relations: + logger.warning("No entities or relations extracted from text chunks") + return + + await self._insert_done() + return _add_entities_and_relations + + async def _insert_multi_modal_docs(mm_docs): + if len(mm_docs) == 0: + logger.warning("No multi-modal documents to insert") + return + + logger.info("[New Docs] inserting %d multi-modal docs", len(mm_docs)) + + # Step 3.1: Transform multi-modal documents into chunks and filter existing ones + inserting_chunks = await chunk_documents( + mm_docs, + split_config["chunk_size"], + split_config["chunk_overlap"], + self.tokenizer_instance, + self.progress_bar, + ) - logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks)) - await self.full_docs_storage.upsert(new_docs) - await self.text_chunks_storage.upsert(inserting_chunks) - - # Step 3: Extract entities and relations from chunks - logger.info("[Entity and Relation Extraction]...") - _add_entities_and_relations = await build_kg( - llm_client=self.synthesizer_llm_client, - kg_instance=self.graph_storage, - chunks=[ - Chunk(id=k, content=v["content"]) for k, v in inserting_chunks.items() - ], - progress_bar=self.progress_bar, - ) - if not _add_entities_and_relations: - logger.warning("No entities or relations extracted") - return + _add_chunk_keys = await self.chunks_storage.filter_keys( + list(inserting_chunks.keys()) + ) + inserting_chunks = { + k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys + } - await self._insert_done() - return _add_entities_and_relations + if len(inserting_chunks) == 0: + logger.warning("All multi-modal chunks are already in the storage") + return + + logger.info( + "[New Chunks] inserting %d multimodal chunks", len(inserting_chunks) + ) + await self.chunks_storage.upsert(inserting_chunks) + + # Step 3.2: Extract multi-modal entities and relations from chunks + logger.info("[Multi-modal Entity and Relation Extraction] processing ...") + _add_entities_and_relations = await 
build_mm_kg( + llm_client=self.synthesizer_llm_client, + kg_instance=self.graph_storage, + chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()], + progress_bar=self.progress_bar, + ) + if not _add_entities_and_relations: + logger.warning( + "No entities or relations extracted from multi-modal chunks" + ) + return + await self._insert_done() + return _add_entities_and_relations + + # Step 2: Insert text documents + await _insert_text_docs(new_text_docs) + # Step 3: Insert multi-modal documents + await _insert_multi_modal_docs(new_mm_docs) async def _insert_done(self): tasks = [] for storage_instance in [ self.full_docs_storage, - self.text_chunks_storage, + self.chunks_storage, self.graph_storage, self.search_storage, ]: @@ -233,7 +286,10 @@ async def quiz_and_judge(self, quiz_and_judge_config: Dict): async def generate(self, partition_config: Dict, generate_config: Dict): # Step 1: partition the graph batches = await partition_kg( - self.graph_storage, self.tokenizer_instance, partition_config + self.graph_storage, + self.chunks_storage, + self.tokenizer_instance, + partition_config, ) # Step 2: generate QA pairs @@ -255,7 +311,7 @@ async def generate(self, partition_config: Dict, generate_config: Dict): @async_to_sync_method async def clear(self): await self.full_docs_storage.drop() - await self.text_chunks_storage.drop() + await self.chunks_storage.drop() await self.search_storage.drop() await self.graph_storage.clear() await self.rephrase_storage.drop() diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index 68944079..37476034 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -6,10 +6,11 @@ MultiHopGenerator, VQAGenerator, ) -from .kg_builder import LightRAGKGBuilder +from .kg_builder import LightRAGKGBuilder, MMKGBuilder from .llm.openai_client import OpenAIClient from .llm.topk_token_model import TopkTokenModel from .partitioner import ( + AnchorBFSPartitioner, BFSPartitioner, DFSPartitioner, ECEPartitioner, diff --git a/graphgen/models/generator/aggregated_generator.py b/graphgen/models/generator/aggregated_generator.py index 37c54c72..bbf483ee 100644 --- a/graphgen/models/generator/aggregated_generator.py +++ b/graphgen/models/generator/aggregated_generator.py @@ -53,7 +53,7 @@ def build_prompt( # ] # ) prompt = AGGREGATED_GENERATION_PROMPT[language]["ANSWER_REPHRASING"].format( - language=language, entities=entities_str, relationships=relations_str + entities=entities_str, relationships=relations_str ) return prompt @@ -115,8 +115,8 @@ async def generate( question_generation_prompt = self._build_prompt_for_question_generation(context) response = await self.llm_client.generate_answer(question_generation_prompt) question = self.parse_response(response)["question"] - logger.info("Question: %s", question) - logger.info("Answer: %s", context) + logger.debug("Question: %s", question) + logger.debug("Answer: %s", context) qa_pairs = { compute_content_hash(question): { "question": question, diff --git a/graphgen/models/generator/atomic_generator.py b/graphgen/models/generator/atomic_generator.py index cb566fdf..bd152d36 100644 --- a/graphgen/models/generator/atomic_generator.py +++ b/graphgen/models/generator/atomic_generator.py @@ -42,8 +42,8 @@ def parse_response(response: str) -> dict: return {} question = question.strip('"') answer = answer.strip('"') - logger.info("Question: %s", question) - logger.info("Answer: %s", answer) + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) return { 
compute_content_hash(question): { "question": question, diff --git a/graphgen/models/generator/cot_generator.py b/graphgen/models/generator/cot_generator.py index 2fc4fe85..bd924b78 100644 --- a/graphgen/models/generator/cot_generator.py +++ b/graphgen/models/generator/cot_generator.py @@ -85,8 +85,8 @@ def parse_response(response: str) -> dict: question = question.strip('"') reasoning_path = reasoning_path.strip('"') - logger.info("CoT Question: %s", question) - logger.info("CoT Reasoning Path: %s", reasoning_path) + logger.debug("CoT Question: %s", question) + logger.debug("CoT Reasoning Path: %s", reasoning_path) return { "question": question, "reasoning_path": reasoning_path, @@ -110,7 +110,7 @@ async def generate( question, reasoning_path = response["question"], response["reasoning_path"] prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path) cot_answer = await self.llm_client.generate_answer(prompt) - logger.info("CoT Answer: %s", cot_answer) + logger.debug("CoT Answer: %s", cot_answer) qa_pairs = { compute_content_hash(question): { "question": question, diff --git a/graphgen/models/generator/multi_hop_generator.py b/graphgen/models/generator/multi_hop_generator.py index 257fc1dd..3fd18244 100644 --- a/graphgen/models/generator/multi_hop_generator.py +++ b/graphgen/models/generator/multi_hop_generator.py @@ -45,8 +45,8 @@ def parse_response(response: str) -> dict: return {} question = question.strip('"') answer = answer.strip('"') - logger.info("Question: %s", question) - logger.info("Answer: %s", answer) + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) return { compute_content_hash(question): { "question": question, diff --git a/graphgen/models/generator/vqa_generator.py b/graphgen/models/generator/vqa_generator.py index 05d1867b..b0c29d2a 100644 --- a/graphgen/models/generator/vqa_generator.py +++ b/graphgen/models/generator/vqa_generator.py @@ -2,6 +2,8 @@ from typing import Any from graphgen.bases import BaseGenerator +from graphgen.templates import VQA_GENERATION_PROMPT +from graphgen.utils import compute_content_hash, detect_main_language, logger @dataclass @@ -10,14 +12,127 @@ class VQAGenerator(BaseGenerator): def build_prompt( batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] ) -> str: - raise NotImplementedError( - "VQAGenerator.build_prompt is not implemented. " - "Please provide an implementation for VQA prompt construction." + nodes, edges = batch + entities_str = "\n".join( + [ + f"{index + 1}. {node[0]}: {node[1]['description']}" + for index, node in enumerate(nodes) + ] ) + relationships_str = "\n".join( + [ + f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}" + for index, edge in enumerate(edges) + ] + ) + language = detect_main_language(entities_str + relationships_str) + prompt = VQA_GENERATION_PROMPT[language].format( + entities=entities_str, relationships=relationships_str + ) + return prompt + @staticmethod def parse_response(response: str) -> Any: - raise NotImplementedError( - "VQAGenerator.parse_response is not implemented. " - "Please provide an implementation for VQA response parsing." 
- ) + """ + Parse the LLM response and return the generated QAs + :param response + :return: QA pairs + """ + qa_pairs = {} + qa_list = response.strip().split("\n\n") + for qa in qa_list: + if "Question:" in qa and "Answer:" in qa: + question = qa.split("Question:")[1].split("Answer:")[0].strip() + answer = qa.split("Answer:")[1].strip() + elif "问题:" in qa and "答案:" in qa: + question = qa.split("问题:")[1].split("答案:")[0].strip() + answer = qa.split("答案:")[1].strip() + else: + logger.error("Failed to parse QA pair: %s", qa) + continue + question = question.strip('"') + answer = answer.strip('"') + logger.debug("Question: %s", question) + logger.debug("Answer: %s", answer) + qa_pairs[compute_content_hash(question)] = { + "question": question, + "answer": answer, + } + return qa_pairs + + async def generate( + self, + batch: tuple[ + list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]] + ], + ) -> dict[str, Any]: + """ + Generate QAs based on a given batch. + :param batch + :return: QA pairs + """ + result = {} + prompt = self.build_prompt(batch) + response = await self.llm_client.generate_answer(prompt) + qa_pairs = self.parse_response(response) # generate one or more QA pairs + nodes, _ = batch + for node in nodes: + node_data = node[1] + if "images" in node_data and node_data["images"]: + img_path = node_data["images"]["img_path"] + for qa in qa_pairs.values(): + qa["img_path"] = img_path + result.update(qa_pairs) + return result + + @staticmethod + def format_generation_results( + results: list[dict], output_data_format: str + ) -> list[dict[str, Any]]: + if output_data_format == "Alpaca": + results = [ + { + "instruction": v["question"], + "input": "", + "output": v["answer"], + "image": v.get("img_path", ""), + } + for item in results + for k, v in item.items() + ] + elif output_data_format == "Sharegpt": + results = [ + { + "conversations": [ + { + "from": "human", + "value": [ + {"text": v["question"], "image": v.get("img_path", "")} + ], + }, + {"from": "gpt", "value": v["answer"]}, + ] + } + for item in results + for k, v in item.items() + ] + elif output_data_format == "ChatML": + results = [ + { + "messages": [ + { + "role": "user", + "content": [ + {"text": v["question"], "image": v.get("img_path", "")} + ], + }, + {"role": "assistant", "content": v["answer"]}, + ] + } + for item in results + for k, v in item.items() + ] + else: + raise ValueError(f"Unknown output data format: {output_data_format}") + return results diff --git a/graphgen/models/kg_builder/__init__.py b/graphgen/models/kg_builder/__init__.py index 4d630c5f..1e7e2c44 100644 --- a/graphgen/models/kg_builder/__init__.py +++ b/graphgen/models/kg_builder/__init__.py @@ -1 +1,2 @@ from .light_rag_kg_builder import LightRAGKGBuilder +from .mm_kg_builder import MMKGBuilder diff --git a/graphgen/models/kg_builder/light_rag_kg_builder.py b/graphgen/models/kg_builder/light_rag_kg_builder.py index abbccf07..e734eca6 100644 --- a/graphgen/models/kg_builder/light_rag_kg_builder.py +++ b/graphgen/models/kg_builder/light_rag_kg_builder.py @@ -6,7 +6,6 @@ from graphgen.bases import BaseGraphStorage, BaseKGBuilder, BaseLLMClient, Chunk from graphgen.templates import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT from graphgen.utils import ( - detect_if_chinese, detect_main_language, handle_single_entity_extraction, handle_single_relationship_extraction, @@ -33,8 +32,7 @@ async def extract( content = chunk.content # step 1: language_detection - language = "Chinese" if detect_if_chinese(content) else "English" - 
KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language + language = detect_main_language(content) hint_prompt = KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format( **KG_EXTRACTION_PROMPT["FORMAT"], input_text=content @@ -42,7 +40,7 @@ async def extract( # step 2: initial glean final_result = await self.llm_client.generate_answer(hint_prompt) - logger.info("First extraction result: %s", final_result) + logger.debug("First extraction result: %s", final_result) # step3: iterative refinement history = pack_history_conversations(hint_prompt, final_result) @@ -57,7 +55,7 @@ async def extract( glean_result = await self.llm_client.generate_answer( text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history ) - logger.info("Loop %s glean: %s", loop_idx + 1, glean_result) + logger.debug("Loop %s glean: %s", loop_idx + 1, glean_result) history += pack_history_conversations( KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result @@ -201,11 +199,6 @@ async def _handle_kg_summary( tokenizer_instance = self.llm_client.tokenizer language = detect_main_language(description) - if language == "en": - language = "English" - else: - language = "Chinese" - KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language tokens = tokenizer_instance.encode(description) if len(tokens) < max_summary_tokens: diff --git a/graphgen/models/kg_builder/mm_kg_builder.py b/graphgen/models/kg_builder/mm_kg_builder.py new file mode 100644 index 00000000..c5547291 --- /dev/null +++ b/graphgen/models/kg_builder/mm_kg_builder.py @@ -0,0 +1,93 @@ +import re +from collections import defaultdict +from typing import Dict, List, Tuple + +from graphgen.bases import BaseLLMClient, Chunk +from graphgen.templates import MMKG_EXTRACTION_PROMPT +from graphgen.utils import ( + detect_main_language, + handle_single_entity_extraction, + handle_single_relationship_extraction, + logger, + split_string_by_multi_markers, +) + +from .light_rag_kg_builder import LightRAGKGBuilder + + +class MMKGBuilder(LightRAGKGBuilder): + llm_client: BaseLLMClient = None + + async def extract( + self, chunk: Chunk + ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: + """ + Extract entities and relationships from a single multi-modal chunk using the LLM client. + Expect to get a mini graph which contains a central multi-modal entity + and its related text entities and relationships. + Like: + (image: "image_of_eiffel_tower") --[located_in]--> (text: "Paris") + (image: "image_of_eiffel_tower") --[built_in]--> (text: "1889") + (text: "Eiffel Tower") --[height]--> (text: "324 meters") + :param chunk + """ + chunk_id = chunk.id + chunk_type = chunk.type # image | table | formula | ... 
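+        # A typical incoming image chunk is assumed to look like (illustrative;
+        # MinerU-style reader output, actual keys depend on the reader):
+        #   Chunk(id="image-<md5>", content="", type="image",
+        #         metadata={"type": "image", "img_path": "images/xxx.jpg",
+        #                   "image_caption": ["..."]})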
+        metadata = chunk.metadata
+
+        # choose different extraction strategies based on chunk type
+        if chunk_type == "image":
+            image_caption = "\n".join(metadata.get("image_caption", []))
+            language = detect_main_language(image_caption)
+            prompt_template = MMKG_EXTRACTION_PROMPT[language].format(
+                **MMKG_EXTRACTION_PROMPT["FORMAT"],
+                chunk_type=chunk_type,
+                chunk_id=chunk_id,
+                chunk_text=image_caption,
+            )
+            result = await self.llm_client.generate_answer(prompt_template)
+            logger.debug("Image chunk extraction result: %s", result)
+
+            # parse the result
+            records = split_string_by_multi_markers(
+                result,
+                [
+                    MMKG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"],
+                    MMKG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"],
+                ],
+            )
+
+            nodes = defaultdict(list)
+            edges = defaultdict(list)
+
+            for record in records:
+                match = re.search(r"\((.*)\)", record)
+                if not match:
+                    continue
+                inner = match.group(1)
+
+                attributes = split_string_by_multi_markers(
+                    inner, [MMKG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]]
+                )
+
+                entity = await handle_single_entity_extraction(attributes, chunk_id)
+                if entity is not None:
+                    nodes[entity["entity_name"]].append(entity)
+                    continue
+
+                relation = await handle_single_relationship_extraction(
+                    attributes, chunk_id
+                )
+                if relation is not None:
+                    key = (relation["src_id"], relation["tgt_id"])
+                    edges[key].append(relation)
+
+            return dict(nodes), dict(edges)
+
+        if chunk_type == "table":
+            pass  # TODO: implement table-based entity and relationship extraction
+        if chunk_type == "equation":
+            pass  # TODO: implement equation-based entity and relationship extraction
+
+        logger.error("Unsupported chunk type for MMKGBuilder: %s", chunk_type)
+        return defaultdict(list), defaultdict(list)
diff --git a/graphgen/models/partitioner/__init__.py b/graphgen/models/partitioner/__init__.py
index 9d37a5d4..2e1bcb68 100644
--- a/graphgen/models/partitioner/__init__.py
+++ b/graphgen/models/partitioner/__init__.py
@@ -1,3 +1,4 @@
+from .anchor_bfs_partitioner import AnchorBFSPartitioner
 from .bfs_partitioner import BFSPartitioner
 from .dfs_partitioner import DFSPartitioner
 from .ece_partitioner import ECEPartitioner
diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py
new file mode 100644
index 00000000..b6248d43
--- /dev/null
+++ b/graphgen/models/partitioner/anchor_bfs_partitioner.py
@@ -0,0 +1,128 @@
+import random
+from collections import deque
+from typing import Any, List, Literal, Set, Tuple
+
+from graphgen.bases import BaseGraphStorage
+from graphgen.bases.datatypes import Community
+
+from .bfs_partitioner import BFSPartitioner
+
+NODE_UNIT: str = "n"
+EDGE_UNIT: str = "e"
+
+
+class AnchorBFSPartitioner(BFSPartitioner):
+    """
+    Anchor BFS partitioner that partitions the graph into communities of a fixed size.
+    1. Randomly choose a node of a specified type as the anchor.
+    2. Expand the community using BFS until the max unit size is reached. (A unit is a node or an edge.)
+    3. Non-anchor units can only be "pulled" into a community and never become seeds themselves.
+    For example, for VQA tasks, we may want to use image nodes as anchors and expand to nearby text nodes and edges.
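+
+    Usage sketch (illustrative; assumes a populated BaseGraphStorage instance `g`):
+        partitioner = AnchorBFSPartitioner(anchor_type="image")
+        communities = await partitioner.partition(g=g, max_units_per_community=10)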
+ """ + + def __init__( + self, + *, + anchor_type: Literal["image"] = "image", + anchor_ids: Set[str] | None = None, + ) -> None: + super().__init__() + self.anchor_type = anchor_type + self.anchor_ids = anchor_ids + + async def partition( + self, + g: BaseGraphStorage, + max_units_per_community: int = 1, + **kwargs: Any, + ) -> List[Community]: + nodes = await g.get_all_nodes() # List[tuple[id, meta]] + edges = await g.get_all_edges() # List[tuple[u, v, meta]] + + adj, _ = self._build_adjacency_list(nodes, edges) + + anchors: Set[str] = await self._pick_anchor_ids(nodes) + if not anchors: + return [] # if no anchors, return empty list + + used_n: set[str] = set() + used_e: set[frozenset[str]] = set() + communities: List[Community] = [] + + seeds = list(anchors) + random.shuffle(seeds) + + for seed_node in seeds: + if seed_node in used_n: + continue + comm_n, comm_e = await self._grow_community( + seed_node, adj, max_units_per_community, used_n, used_e + ) + if comm_n or comm_e: + communities.append( + Community(id=len(communities), nodes=comm_n, edges=comm_e) + ) + + return communities + + async def _pick_anchor_ids( + self, + nodes: List[tuple[str, dict]], + ) -> Set[str]: + if self.anchor_ids is not None: + return self.anchor_ids + + anchor_ids: Set[str] = set() + for node_id, meta in nodes: + node_type = str(meta.get("entity_type", "")).lower() + if self.anchor_type.lower() in node_type: + anchor_ids.add(node_id) + return anchor_ids + + @staticmethod + async def _grow_community( + seed: str, + adj: dict[str, List[str]], + max_units: int, + used_n: set[str], + used_e: set[frozenset[str]], + ) -> Tuple[List[str], List[Tuple[str, str]]]: + """ + Grow a community from the seed node using BFS. + :param seed: seed node id + :param adj: adjacency list + :param max_units: maximum number of units (nodes + edges) in the community + :param used_n: set of used node ids + :param used_e: set of used edge keys + :return: (list of node ids, list of edge tuples) + """ + comm_n: List[str] = [] + comm_e: List[Tuple[str, str]] = [] + queue: deque[tuple[str, Any]] = deque([(NODE_UNIT, seed)]) + cnt = 0 + + while queue and cnt < max_units: + k, it = queue.popleft() + + if k == NODE_UNIT: + if it in used_n: + continue + used_n.add(it) + comm_n.append(it) + cnt += 1 + for nei in adj[it]: + e_key = frozenset((it, nei)) + if e_key not in used_e: + queue.append((EDGE_UNIT, e_key)) + else: # EDGE_UNIT + if it in used_e: + continue + used_e.add(it) + u, v = it + comm_e.append((u, v)) + cnt += 1 + for n in it: + if n not in used_n: + queue.append((NODE_UNIT, n)) + + return comm_n, comm_e diff --git a/graphgen/models/reader/csv_reader.py b/graphgen/models/reader/csv_reader.py index 555a245f..97b26c68 100644 --- a/graphgen/models/reader/csv_reader.py +++ b/graphgen/models/reader/csv_reader.py @@ -9,6 +9,9 @@ class CSVReader(BaseReader): def read(self, file_path: str) -> List[Dict[str, Any]]: df = pd.read_csv(file_path) - if self.text_column not in df.columns: - raise ValueError(f"Missing '{self.text_column}' column in CSV file.") - return df.to_dict(orient="records") + for _, row in df.iterrows(): + if "type" in row and row["type"] == "text" and self.text_column not in row: + raise ValueError( + f"Missing '{self.text_column}' in document: {row.to_dict()}" + ) + return self.filter(df.to_dict(orient="records")) diff --git a/graphgen/models/reader/json_reader.py b/graphgen/models/reader/json_reader.py index 932dd01c..943fbcab 100644 --- a/graphgen/models/reader/json_reader.py +++ b/graphgen/models/reader/json_reader.py 
@@ -10,9 +10,9 @@ def read(self, file_path: str) -> List[Dict[str, Any]]: data = json.load(f) if isinstance(data, list): for doc in data: - if self.text_column not in doc: + if doc.get("type") == "text" and self.text_column not in doc: raise ValueError( f"Missing '{self.text_column}' in document: {doc}" ) - return data + return self.filter(data) raise ValueError("JSON file must contain a list of documents.") diff --git a/graphgen/models/reader/jsonl_reader.py b/graphgen/models/reader/jsonl_reader.py index 744ed39e..be9f1cca 100644 --- a/graphgen/models/reader/jsonl_reader.py +++ b/graphgen/models/reader/jsonl_reader.py @@ -12,12 +12,11 @@ def read(self, file_path: str) -> List[Dict[str, Any]]: for line in f: try: doc = json.loads(line) - if self.text_column in doc: - docs.append(doc) - else: + if doc.get("type") == "text" and self.text_column not in doc: raise ValueError( f"Missing '{self.text_column}' in document: {doc}" ) + docs.append(doc) except json.JSONDecodeError as e: logger.error("Error decoding JSON line: %s. Error: %s", line, e) - return docs + return self.filter(docs) diff --git a/graphgen/models/reader/pdf_reader.py b/graphgen/models/reader/pdf_reader.py index c8ca13c1..94562cb5 100644 --- a/graphgen/models/reader/pdf_reader.py +++ b/graphgen/models/reader/pdf_reader.py @@ -74,7 +74,7 @@ def read(self, file_path: str, **override) -> List[Dict[str, Any]]: kwargs = {**self._default_kwargs, **override} mineru_result = self._call_mineru(pdf_path, kwargs) - return mineru_result + return self.filter(mineru_result) def _call_mineru( self, pdf_path: Path, kwargs: Dict[str, Any] @@ -172,8 +172,6 @@ def _try_load_cached_result( for key in ("page_idx", "bbox", "text_level"): if item.get(key) is not None: del item[key] - if item["type"] == "text" and not item["content"].strip(): - continue results.append(item) return results diff --git a/graphgen/models/reader/txt_reader.py b/graphgen/models/reader/txt_reader.py index 5d30703b..e0a9e5c0 100644 --- a/graphgen/models/reader/txt_reader.py +++ b/graphgen/models/reader/txt_reader.py @@ -11,4 +11,4 @@ def read(self, file_path: str) -> List[Dict[str, Any]]: line = line.strip() if line: docs.append({self.text_column: line}) - return docs + return self.filter(docs) diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py index 88c31497..2ad37e63 100644 --- a/graphgen/operators/__init__.py +++ b/graphgen/operators/__init__.py @@ -1,4 +1,4 @@ -from .build_kg import build_kg +from .build_kg import build_mm_kg, build_text_kg from .generate import generate_qas from .judge import judge_statement from .partition import partition_kg diff --git a/graphgen/operators/build_kg/__init__.py b/graphgen/operators/build_kg/__init__.py index 18766fe6..70dac51b 100644 --- a/graphgen/operators/build_kg/__init__.py +++ b/graphgen/operators/build_kg/__init__.py @@ -1 +1,2 @@ -from .build_kg import build_kg +from .build_mm_kg import build_mm_kg +from .build_text_kg import build_text_kg diff --git a/graphgen/operators/build_kg/build_mm_kg.py b/graphgen/operators/build_kg/build_mm_kg.py new file mode 100644 index 00000000..9301c2b9 --- /dev/null +++ b/graphgen/operators/build_kg/build_mm_kg.py @@ -0,0 +1,56 @@ +from collections import defaultdict +from typing import List + +import gradio as gr + +from graphgen.bases.base_storage import BaseGraphStorage +from graphgen.bases.datatypes import Chunk +from graphgen.models import MMKGBuilder, OpenAIClient +from graphgen.utils import run_concurrent + + +async def build_mm_kg( + llm_client: OpenAIClient, + 
kg_instance: BaseGraphStorage, + chunks: List[Chunk], + progress_bar: gr.Progress = None, +): + """ + Build multi-modal KG and merge into kg_instance + :param llm_client: Synthesizer LLM model to extract entities and relationships + :param kg_instance + :param chunks + :param progress_bar: Gradio progress bar to show the progress of the extraction + :return: + """ + mm_builder = MMKGBuilder(llm_client=llm_client) + + results = await run_concurrent( + mm_builder.extract, + chunks, + desc="[2/4] Extracting entities and relationships from multi-modal chunks", + unit="chunk", + progress_bar=progress_bar, + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + for n, e in results: + for k, v in n.items(): + nodes[k].extend(v) + for k, v in e.items(): + edges[tuple(sorted(k))].extend(v) + + await run_concurrent( + lambda kv: mm_builder.merge_nodes(kv, kg_instance=kg_instance), + list(nodes.items()), + desc="Inserting entities into storage", + ) + + await run_concurrent( + lambda kv: mm_builder.merge_edges(kv, kg_instance=kg_instance), + list(edges.items()), + desc="Inserting relationships into storage", + ) + + return kg_instance diff --git a/graphgen/operators/build_kg/build_kg.py b/graphgen/operators/build_kg/build_text_kg.py similarity index 98% rename from graphgen/operators/build_kg/build_kg.py rename to graphgen/operators/build_kg/build_text_kg.py index fdc90626..3babe2e5 100644 --- a/graphgen/operators/build_kg/build_kg.py +++ b/graphgen/operators/build_kg/build_text_kg.py @@ -9,7 +9,7 @@ from graphgen.utils import run_concurrent -async def build_kg( +async def build_text_kg( llm_client: OpenAIClient, kg_instance: BaseGraphStorage, chunks: List[Chunk], diff --git a/graphgen/operators/judge.py b/graphgen/operators/judge.py index d1b0e86e..f7b0b963 100644 --- a/graphgen/operators/judge.py +++ b/graphgen/operators/judge.py @@ -37,7 +37,7 @@ async def _judge_single_relation( edge_data = edge[2] if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None: - logger.info( + logger.debug( "Edge %s -> %s already judged, loss: %s, skip", source_id, target_id, @@ -63,7 +63,7 @@ async def _judge_single_relation( loss = yes_no_loss_entropy(judgements, gts) - logger.info( + logger.debug( "Edge %s -> %s description: %s loss: %s", source_id, target_id, @@ -100,7 +100,7 @@ async def _judge_single_entity( node_data = node[1] if (not re_judge) and "loss" in node_data and node_data["loss"] is not None: - logger.info( + logger.debug( "Node %s already judged, loss: %s, skip", node_id, node_data["loss"] ) return node_id, node_data @@ -123,14 +123,14 @@ async def _judge_single_entity( loss = yes_no_loss_entropy(judgements, gts) - logger.info( + logger.debug( "Node %s description: %s loss: %s", node_id, description, loss ) node_data["loss"] = loss except Exception as e: # pylint: disable=broad-except logger.error("Error in judging entity %s: %s", node_id, e) - logger.info("Use default loss 0.1") + logger.error("Use default loss 0.1") node_data["loss"] = -math.log(0.1) await graph_storage.update_node(node_id, node_data) diff --git a/graphgen/operators/partition/partition_kg.py b/graphgen/operators/partition/partition_kg.py index b03d3221..817ebe27 100644 --- a/graphgen/operators/partition/partition_kg.py +++ b/graphgen/operators/partition/partition_kg.py @@ -1,7 +1,8 @@ from typing import Any -from graphgen.bases import BaseGraphStorage, BaseTokenizer +from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseTokenizer from graphgen.models import ( + AnchorBFSPartitioner, 
BFSPartitioner, DFSPartitioner, ECEPartitioner, @@ -14,6 +15,7 @@ async def partition_kg( kg_instance: BaseGraphStorage, + chunk_storage: BaseKVStorage, tokenizer: Any = BaseTokenizer, partition_config: dict = None, ) -> list[ @@ -39,10 +41,28 @@ async def partition_kg( elif method == "leiden": logger.info("Partitioning knowledge graph using Leiden method.") partitioner = LeidenPartitioner() + elif method == "anchor_bfs": + logger.info("Partitioning knowledge graph using Anchor BFS method.") + partitioner = AnchorBFSPartitioner( + anchor_type=method_params.get("anchor_type"), + anchor_ids=set(method_params.get("anchor_ids", [])) + if method_params.get("anchor_ids") + else None, + ) else: raise ValueError(f"Unsupported partition method: {method}") communities = await partitioner.partition(g=kg_instance, **method_params) logger.info("Partitioned the graph into %d communities.", len(communities)) batches = await partitioner.community2batch(communities, g=kg_instance) + + for _, batch in enumerate(batches): + nodes, edges = batch + for node_id, node_data in nodes: + entity_type = node_data.get("entity_type") + if entity_type and "image" in entity_type.lower(): + node_id = node_id.strip('"').lower() + image_data = await chunk_storage.get_by_id(node_id) + if image_data: + node_data["images"] = image_data return batches diff --git a/graphgen/operators/split/split_chunks.py b/graphgen/operators/split/split_chunks.py index caba96a3..e400ea63 100644 --- a/graphgen/operators/split/split_chunks.py +++ b/graphgen/operators/split/split_chunks.py @@ -48,25 +48,31 @@ async def chunk_documents( async for doc_key, doc in tqdm_async( new_docs.items(), desc="[1/4]Chunking documents", unit="doc" ): - doc_language = detect_main_language(doc["content"]) - text_chunks = split_chunks( - doc["content"], - language=doc_language, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - ) + doc_type = doc.get("type") + if doc_type == "text": + doc_language = detect_main_language(doc["content"]) + text_chunks = split_chunks( + doc["content"], + language=doc_language, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + ) - chunks = { - compute_content_hash(txt, prefix="chunk-"): { - "content": txt, - "full_doc_id": doc_key, - "length": len(tokenizer_instance.encode(txt)) - if tokenizer_instance - else len(txt), - "language": doc_language, + chunks = { + compute_content_hash(txt, prefix="chunk-"): { + "content": txt, + "type": "text", + "full_doc_id": doc_key, + "length": len(tokenizer_instance.encode(txt)) + if tokenizer_instance + else len(txt), + "language": doc_language, + } + for txt in text_chunks } - for txt in text_chunks - } + else: + chunks = {doc_key.replace("doc-", f"{doc_type}-"): {**doc}} + inserting_chunks.update(chunks) if progress_bar is not None: diff --git a/graphgen/templates/__init__.py b/graphgen/templates/__init__.py index 8f764cc0..ea28c4d0 100644 --- a/graphgen/templates/__init__.py +++ b/graphgen/templates/__init__.py @@ -5,9 +5,9 @@ ATOMIC_GENERATION_PROMPT, COT_GENERATION_PROMPT, MULTI_HOP_GENERATION_PROMPT, + VQA_GENERATION_PROMPT, ) -from .kg_extraction import KG_EXTRACTION_PROMPT -from .kg_summarization import KG_SUMMARIZATION_PROMPT +from .kg import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT, MMKG_EXTRACTION_PROMPT from .question_generation import QUESTION_GENERATION_PROMPT from .search_judgement import SEARCH_JUDGEMENT_PROMPT from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT diff --git a/graphgen/templates/generation/__init__.py 
b/graphgen/templates/generation/__init__.py index 1b624d3b..b58c2b6c 100644 --- a/graphgen/templates/generation/__init__.py +++ b/graphgen/templates/generation/__init__.py @@ -2,3 +2,4 @@ from .atomic_generation import ATOMIC_GENERATION_PROMPT from .cot_generation import COT_GENERATION_PROMPT from .multi_hop_generation import MULTI_HOP_GENERATION_PROMPT +from .vqa_generation import VQA_GENERATION_PROMPT diff --git a/graphgen/templates/generation/aggregated_generation.py b/graphgen/templates/generation/aggregated_generation.py index 9e1bfac8..305064e7 100644 --- a/graphgen/templates/generation/aggregated_generation.py +++ b/graphgen/templates/generation/aggregated_generation.py @@ -1,7 +1,7 @@ # pylint: disable=C0301 ANSWER_REPHRASING_CONTEXT_EN: str = """---Role--- You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on ENTITIES and RELATIONSHIPS provided below. You may refer to the original text to assist in generating the rephrased version, but ensure that the final output text meets the requirements. -Use {language} as output language. +Use English as output language. ---Goal--- To generate a version of the text that is rephrased and conveys the same meaning as the original entity and relationship descriptions, while: @@ -52,7 +52,7 @@ ANSWER_REPHRASING_CONTEXT_ZH: str = """---角色--- 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。你可以参考原始文本辅助生成,但需要确保最终输出的文本符合要求。 -使用{language}作为输出语言。 +使用中文作为输出语言。 ---目标--- 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时: @@ -100,7 +100,7 @@ ANSWER_REPHRASING_EN: str = """---Role--- You are an NLP expert responsible for generating a logically structured and coherent rephrased version of the TEXT based on ENTITIES and RELATIONSHIPS provided below. -Use {language} as output language. +Use English as output language. ---Goal--- To generate a version of the text that is rephrased and conveys the same meaning as the original entity and relationship descriptions, while: @@ -146,7 +146,7 @@ ANSWER_REPHRASING_ZH: str = """---角色--- 你是一位NLP专家,负责根据下面提供的实体和关系生成逻辑结构清晰且连贯的文本重述版本。 -使用{language}作为输出语言。 +使用中文作为输出语言。 ---目标--- 生成文本的重述版本,使其传达与原始实体和关系描述相同的含义,同时: diff --git a/graphgen/templates/generation/vqa_generation.py b/graphgen/templates/generation/vqa_generation.py new file mode 100644 index 00000000..4826be0e --- /dev/null +++ b/graphgen/templates/generation/vqa_generation.py @@ -0,0 +1,104 @@ +# pylint: disable=C0301 +TEMPLATE_EN: str = """You are a senior VQA data engineer. Your task is to generate logically coherent, verifiable and non-hallucinated question-answer pairs for the given multi-modal samples. +Use English as the output language. + +---Objectives--- +Create multiple sets of VQA question-answer pairs that satisfy the following: +1. Only ask about objectively existing facts in the given data, avoiding subjective or ambiguous questions. +2. Ensure that each question has a clear and verifiable answer, avoiding questions with no answer or uncertainty. +3. Questions should cover various aspects of both image and text content, ensuring diversity and comprehensiveness. +4. Avoid repetitive questions, ensuring that each question is unique and meaningful. +5. Use clear and concise language, avoiding complex or ambiguous wording. + +---Instructions--- +1. Carefully analyze the provided entities and relationships to identify: + - Key concepts and their hierarchical relationships + - Temporal sequences and time order + - Cause-and-effect relationships + - Dependencies between different elements +2. 
Organize the information into a logical sequence by:
+   - Starting with foundational concepts
+   - Gradually building up to more complex relationships
+   - Grouping related ideas together
+   - Creating clear transitions between sections
+3. Maintain the following when generating question-answer pairs:
+   - Logical flow
+   - Clear connections between concepts
+   - Appropriate context and background
+   - Coherent narrative structure
+4. Review and refine the question-answer pairs to ensure:
+   - Overall logical consistency
+   - Clear cause-and-effect relationships
+
+################
+-Entities-
+################
+{entities}
+################
+-Relationships-
+################
+{relationships}
+################
+Directly output the generated questions and answers, please do not directly copy the example questions and answers, and do not provide irrelevant information.
+Here is the response format you should follow:
+Question: <question 1>
+Answer: <answer 1>
+
+Question: <question 2>
+Answer: <answer 2>
+
+"""
+
+TEMPLATE_ZH: str = """---角色---
+你是一位资深 VQA 数据工程师。你需要为给定的多模态样本生成逻辑连贯、可验证、无幻觉的问答对。
+使用中文作为输出语言。
+
+---目标---
+创建多组 VQA 问答对,满足:
+1. 仅询问给定数据中客观存在的事实,避免主观或模糊的问题。
+2. 确保每个问题都有明确且可验证的答案,避免无答案或不确定的问题。
+3. 问题应涵盖图像和文本内容的各个方面,确保多样性和全面性。
+4. 避免重复问题,确保每个问题都是独特且有意义的。
+5. 使用清晰简洁的语言,避免复杂或含糊的措辞。
+
+---说明---
+1. 仔细分析提供的实体和关系,以识别:
+   - 关键概念及其层级关系
+   - 时间序列和时间顺序
+   - 因果关系
+   - 不同元素之间的依赖关系
+2. 通过以下方式将信息组织成逻辑顺序:
+   - 从基础概念开始
+   - 逐步建立更复杂的关系
+   - 将相关的想法分组在一起
+   - 在各部分之间创建清晰的过渡
+3. 生成问答对时保持:
+   - 逻辑流畅
+   - 概念之间的清晰联系
+   - 适当的上下文和背景
+   - 连贯的叙述结构
+4. 检查和完善问答对以确保:
+   - 整体逻辑一致性
+   - 清晰的因果关系
+
+################
+-实体-
+################
+{entities}
+
+################
+-关系-
+################
+{relationships}
+################
+直接输出生成的问题和答案,请不要直接复制示例问题和答案,不要输出无关内容。
+以下是你应该遵循的响应格式:
+问题: <问题1>
+答案: <答案1>
+
+问题: <问题2>
+答案: <答案2>
+
+"""
+
+VQA_GENERATION_PROMPT = {"en": TEMPLATE_EN, "zh": TEMPLATE_ZH}
diff --git a/graphgen/templates/kg/__init__.py b/graphgen/templates/kg/__init__.py
new file mode 100644
index 00000000..ea865ce6
--- /dev/null
+++ b/graphgen/templates/kg/__init__.py
@@ -0,0 +1,3 @@
+from .kg_extraction import KG_EXTRACTION_PROMPT
+from .kg_summarization import KG_SUMMARIZATION_PROMPT
+from .mm_kg_extraction import MMKG_EXTRACTION_PROMPT
diff --git a/graphgen/templates/kg_extraction.py b/graphgen/templates/kg/kg_extraction.py
similarity index 98%
rename from graphgen/templates/kg_extraction.py
rename to graphgen/templates/kg/kg_extraction.py
index 8d98bb95..3b8daf41 100644
--- a/graphgen/templates/kg_extraction.py
+++ b/graphgen/templates/kg/kg_extraction.py
@@ -1,10 +1,9 @@
 # pylint: disable=C0301
-
 TEMPLATE_EN: str = """You are an NLP expert, skilled at analyzing text to extract named entities and their relationships.
 
 -Goal-
 Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
-Use {language} as output language.
+Use English as output language.
 
 -Steps-
 1. Identify all entities. For each identified entity, extract the following information:
@@ -23,7 +22,7 @@
 3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
 Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
 
-4. Return output in {language} as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
 
 5. When finished, output {completion_delimiter}
 
@@ -85,7 +84,7 @@
 -目标-
 给定一个实体类型列表和可能与列表相关的文本,从文本中识别所有这些类型的实体,以及这些实体之间所有的关系。
-使用{language}作为输出语言。
+使用中文作为输出语言。
 
 -步骤-
 1. 识别所有实体。对于每个识别的实体,提取以下信息:
@@ -189,12 +188,12 @@
 IF_LOOP_ZH: str = """看起来可能仍然遗漏了一些实体和关系。如果仍有实体和关系需要添加,请回答YES | NO。"""
 
 KG_EXTRACTION_PROMPT: dict = {
-    "English": {
+    "en": {
         "TEMPLATE": TEMPLATE_EN,
         "CONTINUE": CONTINUE_EN,
         "IF_LOOP": IF_LOOP_EN,
     },
-    "Chinese": {
+    "zh": {
         "TEMPLATE": TEMPLATE_ZH,
         "CONTINUE": CONTINUE_ZH,
         "IF_LOOP": IF_LOOP_ZH,
@@ -205,6 +204,5 @@
         "completion_delimiter": "<|COMPLETE|>",
         "entity_types": "concept, date, location, keyword, organization, person, event, work, nature, artificial, \
 science, technology, mission, gene",
-        "language": "English",
     },
 }
diff --git a/graphgen/templates/kg_summarization.py b/graphgen/templates/kg/kg_summarization.py
similarity index 86%
rename from graphgen/templates/kg_summarization.py
rename to graphgen/templates/kg/kg_summarization.py
index 7cf75180..3e7cb6cb 100644
--- a/graphgen/templates/kg_summarization.py
+++ b/graphgen/templates/kg/kg_summarization.py
@@ -3,7 +3,7 @@
 Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
 If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
 Make sure it is written in third person, and include the entity names so we have the full context.
-Use {language} as output language.
+Use English as output language.
 
 #######
 -Data-
@@ -18,7 +18,7 @@
 请将所有这些描述整合成一个综合描述。确保包含所有描述中收集的信息。
 如果提供的描述是矛盾的,请解决这些矛盾并提供一个连贯的总结。
 确保以第三人称写作,并包含实体名称,以便我们有完整的上下文。
-使用{language}作为输出语言。
+使用中文作为输出语言。
 
 #######
 -数据-
@@ -30,14 +30,9 @@
 
 KG_SUMMARIZATION_PROMPT = {
-    "Chinese": {
-        "TEMPLATE": TEMPLATE_ZH
-    },
-    "English": {
-        "TEMPLATE": TEMPLATE_EN
-    },
+    "zh": {"TEMPLATE": TEMPLATE_ZH},
+    "en": {"TEMPLATE": TEMPLATE_EN},
     "FORMAT": {
-        "language": "English",
         "tuple_delimiter": "<|>",
         "record_delimiter": "##",
         "completion_delimiter": "<|COMPLETE|>",
diff --git a/graphgen/templates/kg/mm_kg_extraction.py b/graphgen/templates/kg/mm_kg_extraction.py
new file mode 100644
index 00000000..2805b98d
--- /dev/null
+++ b/graphgen/templates/kg/mm_kg_extraction.py
@@ -0,0 +1,131 @@
+# pylint: disable=C0301
+TEMPLATE_EN: str = """You are an expert in multi-modal data analysis and knowledge graph construction. Your task is to extract named entities and relationships from a given multi-modal data chunk and its accompanying text.
+
+-Objective-
+Given a multi-modal data chunk (e.g., image, table, formula, etc. + accompanying text), construct a knowledge graph centered around the "central multi-modal entity":
+- The central entity must be the image/table/formula itself (e.g., image-c71ef797e99af81047fbc7509609c765).
+- Related entities and relationships must be extracted from the accompanying text.
+- Only retain edges directly connected to the central entity, forming a star-shaped graph.
+Use English as the output language.
+
+-Steps-
+1. Identify the unique central multi-modal entity and recognize all text entities directly related to the central entity from the accompanying text.
+   For the central entity, extract the following information:
+   - entity_name: Use the unique identifier of the data chunk (e.g., image-c71ef797e99af81047fbc7509609c765).
+   - entity_type: Label according to the type of data chunk (image, table, formula, etc.).
+   - entity_summary: A brief description of the content of the data chunk and its role in the accompanying text.
+   For each entity recognized from the accompanying text, extract the following information:
+   - entity_name: The name of the entity, capitalized
+   - entity_type: One of the following types: [{entity_types}]
+   - entity_summary: A comprehensive summary of the entity's attributes and activities
+   Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>)
+
+2. From the entities identified in Step 1, recognize all (source_entity, target_entity) pairs that are *obviously related* to each other.
+   For each pair of related entities, extract the following information:
+   - source_entity: The name of the source entity identified in Step 1
+   - target_entity: The name of the target entity identified in Step 1
+   - relationship_summary: Explain why you think the source entity and target entity are related to each other
+   Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_summary>)
+
+3. Return the output list of all entities and relationships identified in Steps 1 and 2 in English. Use **{record_delimiter}** as the list separator.
+
+4. Upon completion, output {completion_delimiter}
+
+################
+-Example-
+################
+Multi-modal data chunk type: image
+Multi-modal data chunk unique identifier: image-c71ef797e99af81047fbc7509609c765
+Accompanying text: The Eiffel Tower is an iconic structure in Paris, France, designed by Gustave Eiffel and completed in 1889. It stands 324 meters tall and is one of the tallest structures in the world. The Eiffel Tower is located on the banks of the Seine River and attracts millions of visitors each year. It is not only an engineering marvel but also an important symbol of French culture.
+################
+Output:
+("entity"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"image"{tuple_delimiter}"This is an image showcasing the iconic structure in Paris, France, the Eiffel Tower, highlighting its full height of 324 meters along with the riverside scenery, symbolizing both engineering and cultural significance"){record_delimiter}
+("entity"{tuple_delimiter}"Eiffel Tower"{tuple_delimiter}"landmark"{tuple_delimiter}"The Eiffel Tower is an iconic structure in Paris, France, designed by Gustave Eiffel and completed in 1889, standing 324 meters tall, located on the banks of the Seine River, attracting millions of visitors each year"){record_delimiter}
+("entity"{tuple_delimiter}"Paris, France"{tuple_delimiter}"location"{tuple_delimiter}"Paris, France is the capital of France, known for its rich historical and cultural heritage and as the location of the Eiffel Tower"){record_delimiter}
+("entity"{tuple_delimiter}"Gustave Eiffel"{tuple_delimiter}"person"{tuple_delimiter}"Gustave Eiffel is a renowned French engineer who designed and built the Eiffel Tower"){record_delimiter}
+("entity"{tuple_delimiter}"Seine River"{tuple_delimiter}"location"{tuple_delimiter}"The Seine River is a major river flowing through Paris, France, with the Eiffel Tower located on its banks"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Eiffel Tower"{tuple_delimiter}"The image showcases the iconic structure, the Eiffel Tower"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Paris, France"{tuple_delimiter}"The image's background is Paris, France, highlighting the geographical location of the Eiffel Tower"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Gustave Eiffel"{tuple_delimiter}"The Eiffel Tower in the image was designed by Gustave Eiffel"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Seine River"{tuple_delimiter}"The image showcases the scenery of the Eiffel Tower located on the banks of the Seine River"){completion_delimiter}
+################
+
+-Real Data-
+Multi-modal data chunk type: {chunk_type}
+Multi-modal data chunk unique identifier: {chunk_id}
+Accompanying text: {chunk_text}
+################
+Output:
+"""
+
+TEMPLATE_ZH: str = """你是一个多模态数据分析和知识图谱构建专家。你的任务是从给定的多模态数据块及其伴随文本中抽取命名实体与关系。
+
+-目标-
+给定一个多模态数据块(例如图像、表格、公式等 + 伴随文本),构建以「中心多模态实体」为核心的知识图:
+- 中心实体必须是图像/表格/公式本身(如 image-c71ef797e99af81047fbc7509609c765)。
+- 相关实体和关系必须从伴随文本中抽取。
+- 只保留与中心实体直接相连的边,形成星型图。
+使用中文作为输出语言。
+
+-步骤-
+1. 确定唯一的中心多模态实体,从伴随文本中识别所有与中心实体直接相关的文本实体。
+   对于中心实体,提取以下信息:
+   - entity_name:使用数据块的唯一标识符(如 image-c71ef797e99af81047fbc7509609c765)。
+   - entity_type:根据数据块类型(图像、表格、公式等)进行标注。
+   - entity_summary:简要描述数据块的内容和其在伴随文本中的作用。
+   对于从伴随文本中识别的每个实体,提取以下信息:
+   - entity_name:实体的名称,首字母大写
+   - entity_type:以下类型之一:[{entity_types}]
+   - entity_summary:实体的属性与活动的全面总结
+   将每个实体格式化为("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>)
+
+2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*明显相关*。
+   对于每对相关的实体,提取以下信息:
+   - source_entity:步骤1中识别的源实体名称
+   - target_entity:步骤1中识别的目标实体名称
+   - relationship_summary:解释为什么你认为源实体和目标实体彼此相关
+   将每个关系格式化为("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_summary>)
+
+3. 以中文返回步骤1和2中识别出的所有实体和关系的输出列表。使用**{record_delimiter}**作为列表分隔符。
+
+4. 完成后,输出{completion_delimiter}
+
+################
+-示例-
+################
+多模态数据块类型:image
+多模态数据块唯一标识符:image-c71ef797e99af81047fbc7509609c765
+伴随文本:埃菲尔铁塔是法国巴黎的标志性结构,由古斯塔夫·埃菲尔设计并于1889年建成。它高324米,是世界上最高的建筑之一。埃菲尔铁塔位于塞纳河畔,吸引了数百万游客前来参观。它不仅是工程学的奇迹,也是法国文化的重要象征。
+################
+输出:
+("entity"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"image"{tuple_delimiter}"这是一张展示法国巴黎标志性建筑的图像,主体为埃菲尔铁塔,呈现其324米高度的全貌与河畔景观,具有工程与文化双重象征意义"){record_delimiter}
+("entity"{tuple_delimiter}"埃菲尔铁塔"{tuple_delimiter}"landmark"{tuple_delimiter}"埃菲尔铁塔是法国巴黎的标志性结构,由古斯塔夫·埃菲尔设计并于1889年建成,高324米,是世界上最高的建筑之一,位于塞纳河畔,吸引了数百万游客前来参观"){record_delimiter}
+("entity"{tuple_delimiter}"法国巴黎"{tuple_delimiter}"location"{tuple_delimiter}"法国巴黎是法国的首都,以其丰富的历史文化遗产和作为埃菲尔铁塔所在地而闻名"){record_delimiter}
+("entity"{tuple_delimiter}"古斯塔夫·埃菲尔"{tuple_delimiter}"person"{tuple_delimiter}"古斯塔夫·埃菲尔是法国著名的工程师,设计并建造了埃菲尔铁塔"){record_delimiter}
+("entity"{tuple_delimiter}"塞纳河"{tuple_delimiter}"location"{tuple_delimiter}"塞纳河是流经法国巴黎的重要河流,埃菲尔铁塔位于其畔"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"埃菲尔铁塔"{tuple_delimiter}"图像展示了埃菲尔铁塔这一标志性建筑"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"法国巴黎"{tuple_delimiter}"图像背景为法国巴黎,突显了埃菲尔铁塔的地理位置"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"古斯塔夫·埃菲尔"{tuple_delimiter}"图像中的埃菲尔铁塔是由古斯塔夫·埃菲尔设计的"){record_delimiter}
+("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"塞纳河"{tuple_delimiter}"图像展示了埃菲尔铁塔位于塞纳河畔的景观"){completion_delimiter}
+################
+
+-真实数据-
+多模态数据块类型: {chunk_type}
+多模态数据块唯一标识符: {chunk_id}
+伴随文本: {chunk_text}
+################
+输出:
+"""
+
+
+MMKG_EXTRACTION_PROMPT: dict = {
+    "en": TEMPLATE_EN,
+    "zh": TEMPLATE_ZH,
+    "FORMAT": {
+        "tuple_delimiter": "<|>",
+        "record_delimiter": "##",
+        "completion_delimiter": "<|COMPLETE|>",
+        "entity_types": "concept, date, location, keyword, organization, person, event, work, nature, artificial, \
+science, technology, mission, gene",
+    },
+}
diff --git a/graphgen/utils/__init__.py b/graphgen/utils/__init__.py
index 43b61906..eaf86762 100644
--- a/graphgen/utils/__init__.py
+++ b/graphgen/utils/__init__.py
@@ -9,7 +9,7 @@
     split_string_by_multi_markers,
     write_json,
 )
-from .hash import compute_args_hash, compute_content_hash
+from .hash import compute_args_hash, compute_content_hash, compute_mm_hash
 from .help_nltk import NLTKHelper
 from .log import logger, parse_log, set_logger
 from .loop import create_event_loop
diff --git a/graphgen/utils/detect_lang.py b/graphgen/utils/detect_lang.py
index c34ddac4..944ccb2a 100644
--- a/graphgen/utils/detect_lang.py
+++ b/graphgen/utils/detect_lang.py
@@ -1,40 +1,41 @@
 def detect_main_language(text):
     """
-    识别文本的主要语言
+    Detect the main language of the text, 'zh' for Chinese, 'en' for English
     :param text:
     :return:
     """
     assert isinstance(text, str)
+
     def is_chinese_char(char):
-        return '\u4e00' <= char <= '\u9fff'
+        return "\u4e00" <= char <= "\u9fff"
 
     def is_english_char(char):
         return char.isascii() and char.isalpha()
 
-    # remove whitespace and punctuation
-    text = ''.join(char for char in text if char.strip())
+    text = "".join(char for char in text if char.strip())
 
     chinese_count = sum(1 for char in text if is_chinese_char(char))
    english_count = sum(1 for char in text if is_english_char(char))
 
     total = chinese_count + english_count
 
     if total == 0:
-        return 'en'
 
     chinese_ratio = chinese_count / total
 
     if chinese_ratio
diff --git a/graphgen/utils/__init__.py b/graphgen/utils/__init__.py
index 43b61906..eaf86762 100644
--- a/graphgen/utils/__init__.py
+++ b/graphgen/utils/__init__.py
@@ -9,7 +9,7 @@
     split_string_by_multi_markers,
     write_json,
 )
-from .hash import compute_args_hash, compute_content_hash
+from .hash import compute_args_hash, compute_content_hash, compute_mm_hash
 from .help_nltk import NLTKHelper
 from .log import logger, parse_log, set_logger
 from .loop import create_event_loop
diff --git a/graphgen/utils/detect_lang.py b/graphgen/utils/detect_lang.py
index c34ddac4..944ccb2a 100644
--- a/graphgen/utils/detect_lang.py
+++ b/graphgen/utils/detect_lang.py
@@ -1,40 +1,41 @@
 def detect_main_language(text):
     """
-    识别文本的主要语言
+    Detect the main language of the text: 'zh' for Chinese, 'en' for English
     :param text:
     :return:
     """
     assert isinstance(text, str)
+
     def is_chinese_char(char):
-        return '\u4e00' <= char <= '\u9fff'
+        return "\u4e00" <= char <= "\u9fff"

     def is_english_char(char):
         return char.isascii() and char.isalpha()

-    # 去除空格和标点符号
-    text = ''.join(char for char in text if char.strip())
+    text = "".join(char for char in text if char.strip())

     chinese_count = sum(1 for char in text if is_chinese_char(char))
     english_count = sum(1 for char in text if is_english_char(char))

     total = chinese_count + english_count

     if total == 0:
-        return 'en'
+        return "en"

     chinese_ratio = chinese_count / total

     if chinese_ratio >= 0.5:
-        return 'zh'
-    return 'en'
+        return "zh"
+    return "en"
+

 def detect_if_chinese(text):
     """
-    判断文本是否包含有中文
+    Detect if the text contains any Chinese characters
     :param text:
     :return:
     """
     assert isinstance(text, str)
-    return any('\u4e00' <= char <= '\u9fff' for char in text)
+    return any("\u4e00" <= char <= "\u9fff" for char in text)
diff --git a/graphgen/utils/hash.py b/graphgen/utils/hash.py
index bf93ec5f..59812e60 100644
--- a/graphgen/utils/hash.py
+++ b/graphgen/utils/hash.py
@@ -1,7 +1,23 @@
 from hashlib import md5

+
 def compute_args_hash(*args):
     return md5(str(args).encode()).hexdigest()

+
 def compute_content_hash(content, prefix: str = ""):
     return prefix + md5(content.encode()).hexdigest()
+
+
+def compute_mm_hash(item, prefix: str = ""):
+    if item.get("type") == "text" and item.get("content"):
+        content = item["content"].strip()
+    elif item.get("type") == "image" and item.get("img_path"):
+        content = f"image:{item['img_path']}"
+    elif item.get("type") == "table" and item.get("table_body"):
+        content = f"table:{item['table_body']}"
+    elif item.get("type") == "equation" and item.get("text"):
+        content = f"equation:{item['text']}"
+    else:
+        content = str(item)
+    return prefix + md5(content.encode()).hexdigest()
diff --git a/graphgen/utils/log.py b/graphgen/utils/log.py
index b4e0e475..102b7b23 100644
--- a/graphgen/utils/log.py
+++ b/graphgen/utils/log.py
@@ -8,7 +8,8 @@

 def set_logger(
     log_file: str,
-    log_level: int = logging.INFO,
+    file_level: int = logging.DEBUG,
+    console_level: int = logging.INFO,
     *,
     if_stream: bool = True,
     max_bytes: int = 50 * 1024 * 1024,  # 50 MB
@@ -22,14 +23,18 @@
     if force:
         logger.handlers.clear()

-    logger.setLevel(log_level)
+    logger.setLevel(
+        min(file_level, console_level)
+    )  # Set to the lowest level to capture all logs
     logger.propagate = False

     if logger.handlers:
         logger.handlers.clear()

     if if_stream:
-        console = RichHandler(level=log_level, show_path=False, rich_tracebacks=True)
+        console = RichHandler(
+            level=console_level, show_path=False, rich_tracebacks=True
+        )
         console.setFormatter(logging.Formatter("%(message)s"))
         logger.addHandler(console)

@@ -39,7 +44,7 @@
         backupCount=backup_count,
         encoding="utf-8",
     )
-    file_handler.setLevel(log_level)
+    file_handler.setLevel(file_level)
     file_handler.setFormatter(
         logging.Formatter(
             "[%(asctime)s] %(levelname)s [%(name)s:%(filename)s:%(lineno)d] %(message)s",
diff --git a/resources/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg b/resources/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg
new file mode 100644
index 00000000..2d9e1e8e
Binary files /dev/null and b/resources/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg differ
diff --git a/resources/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg b/resources/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg
new file mode 100644
index 00000000..d1872ed4
Binary files /dev/null and b/resources/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg differ
diff --git a/resources/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg b/resources/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg
new file mode 100644
index 00000000..66e790df
Binary files /dev/null and
b/resources/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg differ diff --git a/resources/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg b/resources/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg new file mode 100644 index 00000000..755a0bd7 Binary files /dev/null and b/resources/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg differ diff --git a/resources/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg b/resources/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg new file mode 100644 index 00000000..59c133a6 Binary files /dev/null and b/resources/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg differ diff --git a/resources/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg b/resources/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg new file mode 100644 index 00000000..f2de8c08 Binary files /dev/null and b/resources/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg differ diff --git a/resources/input_examples/vqa_demo.json b/resources/input_examples/vqa_demo.json index a53101da..9d9661ec 100644 --- a/resources/input_examples/vqa_demo.json +++ b/resources/input_examples/vqa_demo.json @@ -1,6 +1,86 @@ [ - {"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, - {"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, - {"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. 
Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, - {"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} -] + { + "type": "text", + "content": "The $4 4 - \\mathbf { k D }$ protein, named harpin, was electroeluted from a preparative SDS-polyacrylamide gel (12). At concentrations ${ \\ge } 5 0 0 \\mathbf { n } \\mathbf { M }$ $( \\geq 2 5 ~ | \\mathbf { \\mu } \\mathbf { g } / \\mathbf { m l } )$ , harpin elicited HR in leaves of tobacco (Fig. 2, sectors 6 and " + }, + { + "type": "text", + "content": "Because supernatants from E. amylovora Ea321(pCPP430) or E. coli DH5α (pCPP430) did not elicit HR, we postulated that harpin was not secreted but rather was present in or on the bacteria. Whole bacteria treated with protease failed to elicit HR, whereas bacteria incubated with protease together with $0 . 5 ~ \\mathrm { m M }$ phenylmethylsulfonyl fluoride (PMSF, a protease inhibitor) did (Table 1). Treatment of bacteria with increasing amounts of protease resulted in a decreased ability to elicit HR that correlated with the disappearance of harpin detectable in SDS-polyacrylamide gels (Table 1). After centrifugation of CFEP at $_ { 1 0 5 , 0 0 0 g }$ for 1 hour, most HR-eliciting activity was found in the supernatant. However, when the cell suspension was brought to $3 0 \\mathrm { \\ m M \\ M g C l } _ { 2 }$ ,before sonication, most activity was associated with the sedimented membrane fraction. Gel-permeation chromatography of unheated CFEP also indicated association of the elicitor with a high molecular weight $( > 1 0 ^ { 6 }$ daltons) fraction, probably membrane vesicles (14). Only the membrane fraction of E. amylovora Ea321(pCPP430) reacted with an antiserum raised in response to harpin (15), further supporting the cell-envelope location of harpin (Fig. 4). " + }, + { + "type": "image", + "img_path": "resources/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg", + "image_caption": [ + "Fig. 1. (A) Physical map of the hrp gene cluster of E. amylovora (4, 18, 29), showing restriction sites: B, Bam HI; E, Eco RI; H, Hind II. Gene hrpN, encoding harpin, is contained in the 1.3 kb Hind II fragment indicated by the solid bar. The shaded region (including hrpN) contains that part of the hrp gene cluster in which most transposon insertions, exemplified by K49, a Tn10 mini-kan (30) insertion, abolish the HR and pathogenicity phenotypes. Most " + ], + "image_footnote": [] + }, + { + "type": "text", + "content": "HR-eliciting activity, harpin was not detected. 
However, when the protease inhibitor PMSF $( 0 . 5 \\mathrm { \\ m M } )$ was included, the bacteria retained HR-eliciting activity and possessed detectable harpin for more than 2 hours. More protease was required per cell to destroy harpin produced by E. coli $\\mathsf { D H S } \\alpha ( \\mathsf { p C P P } 4 3 0 )$ than by Ea321(pCPP430), suggesting that E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ produces more harpin or degrades it more slowly, or both. " + }, + { + "type": "text", + "content": "The ability of bacterial strains to elicit the HR in intact tobacco leaves is related genetically to their ability to elicit a $\\mathbf { K } ^ { + } / \\mathbf { H } ^ { + }$ exchange reaction (XR) in tobacco cell suspension cultures (TCSCs) (16); both reactions require the hrp gene cluster (17). " + }, + { + "type": "image", + "img_path": "resources/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg", + "image_caption": [ + "Fig. 2. Tobacco leaf showing responses 24 hours after infitration of sectors (7) with the following preparations: 1,, living E. coli DH5α (pCPP9) $( 1 \\times 1 0 ^ { 8 } / \\mathrm { m l } )$ ; 2, E. coli DH5α (pCPP430) $( 1 \\ \\times \\ 1 0 ^ { 8 } / \\mathrm { m l } )$ ; 3, E. coli DH5α (pCPP430K49) $( 1 \\times 1 0 ^ { 8 } / \\mathrm { m } )$ ; 4, E. amylovora Ea321 $( 1 \\times 1 0 ^ { 8 } / \\mathsf { m l } )$ ; 5, Ea321K49, an hrp mutant $( 1 \\times 1 0 ^ { 8 } / \\mathsf { m } )$ , 8, heat-treated CFEP from $\\pmb { \\varepsilon }$ coli ${ \\mathsf { D } } { \\mathsf { H } } { \\mathsf { S } } { \\mathsf { { \\alpha } } } ( { \\mathsf { P } } { \\mathsf { C } } { \\mathsf { P } } { \\mathsf { P } } { \\mathsf { 9 } } )$ ; 9,heat-treated CFEP from E. coli DH5α(pCPP430); 10, heat-treated CFEP from E. coli DH5α(pCPP430K49); 11, heattreated CFEP from $\\boldsymbol { \\varepsilon }$ amylovora Ea321; 12, heat-treated CFEP from Ea321K49; 6, harpin $( 1 . 1 \\mu M )$ from E. coli DH5α(pCPP430) eluted from SDS-polyacrylamide gel; 7, same preparation as 6, but protease treated for 2 hours then heated for io min to inactivate protease; 13, harpin $( 1 \\pmb { \\mu } \\pmb { M } )$ from E. amylovora Ea321 eluted from SDS-polyacrylamide gel; 14, same preparation as 13 but with protease treatment as sample 7. Harpin solutions $< - 0 . 3 \\mu \\mathsf { m }$ do not cause collapse of infitrated tissue; spotty and incomplete collapse is caused by harpin between 0.3 and $0 . 5 ~ { \\mu } \\mathsf { m }$ . " + ], + "image_footnote": [] + }, + { + "type": "text", + "content": "We tested the ability of harpin to raise the pH of TCSC bathing solution, an indicator of the XR (Fig. 5). Cells of E. amylovora, grown in rich medium and added to TCSCs caused an increase in pH of the bathing solution after 2 to 3 hours. Addition of purified harpin caused an increase in pH within 1 hour. Erwinia amylovora mutant Ea321K49, which did not produce harpin in culture, and strains of E. coli containing mutated hrp gene clusters failed to elicit the XR. " + }, + { + "type": "text", + "content": "Table 1. Protease sensitivity of the HR-eliciting activity of whole cells of E. amylovora Ea321(pCPP430). Cells were grown in LB medium, harvested'by centrifugation, and resuspended in 0.1 volume of $5 m M$ potassium phosphate $( \\mathsf { p H } \\thinspace 6 . 5 )$ containing tetracycline (40 $\\mu { \\sf g } / { \\sf m } 1 )$ . 
After incubation with protease (Sigma P5147), as indicated, at $\\mathfrak { s } 7 ^ { \\circ } \\mathfrak { C }$ for 5 min, $1 0 0 ~ \\mu !$ of each cell suspension was infiltrated into tobacco leaves. Leaf sector collapse was assayed at 24 hours. At the time of infiltration, portions of protease-treated cell mixtures were iysed, held'in boiling water for 10 min, centrifuged for 10 min at $1 2 . 0 0 0 g .$ and electrophoresed on a $10 \\%$ SDS-polyacrylamide gel to detect harpin. Electrophoresis was done for 2 hours at $1 5 m \\mathsf { A }$ followed by staining with Coomassie blue R-250. Cell-free supernatant, produced from the LB culture, was filter-sterilized and then concentrated with the Centriprep-10 (Amicon, Danvers, Massachusetts). " + }, + { + "type": "table", + "img_path": "resources/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg", + "table_caption": [], + "table_footnote": [], + "table_body": "
<table><tr><td>Protease per milliter</td><td>Tissue collapse</td><td>Harpin detected</td></tr><tr><td>0</td><td>+</td><td>+</td></tr><tr><td>5μg</td><td>+</td><td>+</td></tr><tr><td>10μg</td><td>+</td><td>+</td></tr><tr><td>20 μg</td><td>Weak</td><td>+</td></tr><tr><td>40 μg</td><td>-</td><td></td></tr><tr><td>80μg</td><td></td><td></td></tr><tr><td>80μg + 0.5 mM PMSF</td><td>+</td><td>+</td></tr><tr><td>Cell-free supernatant</td><td></td><td></td></tr></table>
" + }, + { + "type": "text", + "content": "expressed fom pCPP1084 in the T7RNA (20). Insertions of Tn5tac1 in hrpN (21) (Fig. 1) abolished the ability of E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ to elicit HR on tobacco or produce harpin detectable on Western blots. Ea321T5, a derivative of E. amylo" + }, + { + "type": "image", + "img_path": "resources/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg", + "image_caption": [ + "Fig. 3. SDS-polyacrylamide gel electrophoresis of CFEPs and purified harpin. Lanes: 1, purified harpin $( 1 . 5 \\ \\mathsf { \\pmb { \\mu } } \\mathsf { \\pmb { \\mathsf { g } } } )$ from E. coli $\\mathsf { D M } 5 \\alpha ( \\mathsf { p C P } 4 3 0 )$ incubated with protease (9) for 1 hour; 2, purified harpin $( 1 . 5 \\mu \\mathfrak { g } )$ from E. amylovora Ea321 incubated with protease for 1 hour; 3, same as 1, but without treatment with protease; 4, same as 2, but without treatment with protease; 5, CFEP (5 ${ \\pmb { \\mu } } ( { \\pmb q } )$ from E. coli DH5α(pCPP9) treated at $1 0 0 ^ { \\circ } \\mathbb { C }$ for 10'min; 6, CFEP $( 5 \\ \\pmb { \\mu } \\pmb { \\mu } )$ from E. coli DH5a(pCPP430K49) treated at $\\pmb { 1 0 0 } \\pmb { \\circ } \\pmb { \\subset }$ for 10 min; 7, CFEP $( 5 ~ \\mu 9 )$ from E. amylovora Ea321 treated " + ], + "image_footnote": [] + }, + { + "type": "text", + "content": "at $\\pmb { 1 0 0 ^ { \\circ } } \\pmb { \\mathbb { C } }$ for 10 min; 8, CFEP $( 5 ~ \\mu 9 )$ from E. coli DH5a(pCPP430) treated at $1 0 0 ^ { \\circ } \\mathsf { C }$ for 10 min; 9, CFEP $( 5 ~ \\mu 9 )$ from E. amylovora Ea321K49 treated at $_ { 1 0 0 ^ { \\circ } \\mathbb { C } }$ for 10 min. Samples from the preparations in lanes 3, 4, 7, and 8 elicited HR in tobacco leaves. Samples were prepared as described (8) and brought to 125 mM tris-HCI $( \\mathsf { p H } 6 . 8 )$ $4 \\%$ SDS, $20 \\%$ glycerol, boiled for 3 min, then electrophoresed through a $10 \\%$ (w/v) polyacrylamide gel with $0 . 1 \\%$ SDS at $1 5 m A$ for 2 hours in a Mighty Small apparatus according to instructions (Hoefer Scientific Instruments, San Francisco, California). The gel was stained with $0 . 0 2 5 \\%$ Coomassie Blue R-250. Low-range prestained molecular weight standards (Bio-Rad 161-0305) were used and calibrated with an unstained protein marker (Bio-Rad 161-0304). Arrow indicates region corresponding to $4 4 \\ k \\mathsf$ " + }, + { + "type": "image", + "img_path": "resources/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg", + "image_caption": [ + "Fig. 4. Subcellular location of elicitor protein. Logphase cells $( 1 . 5 m )$ of strain Ea321(pCPP430) were fractionated (31). Proteins from each fraction were electrophoresed and transferred to Immobilon-P membrane (Millipore, Bedford, Massachusetts). The Amplified Alkaline Phosphatase Immuno-Blot Assay Kit (170-6412, Bio-Rad Richmond, California) was ", + "used in a Western blot to detect the elicitor protein with an antiserum raised in rabbit in response to harpin (15). (A) Fractions in lanes: 1, periplasm; 2, membrane; 3, whole cells; 4, supernatant; 5, cytoplasm. (B) Harpin purified by high-performance liquid chromatography (19) hybridized with antiserum. Arrows indicates $4 4 \\ k \\mathsf { D }$ based on the molecular weight markers used in Fig. 3. (C) Normal serum control. CFEP from E. coli DH5a(pCPP430) hybridized with pre-immune serum. 
" + ], + "image_footnote": [] + }, + { + "type": "text", + "content": "DNA sequence data from the $1 . 3 – \\mathbf { k } \\mathbf { b }$ Hind II fragment revealed that hrpN is 1155 base pairs long, and it encodes a 385–amino acid protein (Fig. 1). The 15 $\\mathrm { N H } _ { 2 }$ -terminal residues revealed by amino acid sequencing corresponded to those deduced from the DNA' sequence (Fig. 1). The deduced amino acid sequence of harpin (Fig. 1), which corresponded closely with the analyzed amino acid composition, reveals a glycine-rich protein with a high degree of hydrophilicity. It appears to have an open structure, which may explain its heat stability and sensitivity to proteases. A FASTA search (23) of GenBank for similar proteins revealed similarity only with other glycine-rich proteins, such as several plant cell wall proteins and keratins. " + }, + { + "type": "image", + "img_path": "resources/input_examples/images/eda01885ec54011f15e7a4a56bea0129a0475b2ab5b920a4cff20a4fb623517d.jpg", + "image_caption": [ + "Fig. 5. Changes in pH of bathing solution of tobacco cell-suspension cultures (TCSC). Control values (no additive) were subtracted. Open squares, harpin (60 nM); open circles, cells of E. coli $\\mathsf { D H } 5 \\alpha ( \\mathsf { p C P P } 4 3 0 )$ $( 5 ~ \\times ~ 1 0 ^ { 7 }$ cells per milliliter); filled squares, cells of E. amylovora Ea321 $( 5 \\times 1 0 ^ { 7 }$ cells per milliiter); triangles, cells of E. coli DH5α(pCPP430K49) $( 5 \\times 1 0 ^ { 7 }$ cells per milliter); diamonds, cells of $\\boldsymbol { \\varepsilon }$ amylovora Ea321K49 $( 5 ~ \\times ~ 1 0 ^ { 7 }$ cells per milliter); filled circles, cells of $\\boldsymbol { E } .$ coli DH5α(pCPP9) $( 5 \\times$ $\\pmb { 1 0 ^ { 6 } }$ cells per mililiter). TCSCs were shaken at room temperature with the indicated preparations. The pH was measured at the intervals indicated. All preparations that elicited HR in tobacco leaves (Fig. 2) also caused a pH increase in the TCSC medium. " + ], + "image_footnote": [] + } +] \ No newline at end of file