diff --git a/graphgen/configs/aggregated_config.yaml b/graphgen/configs/aggregated_config.yaml index a40aa778..6510a91d 100644 --- a/graphgen/configs/aggregated_config.yaml +++ b/graphgen/configs/aggregated_config.yaml @@ -1,5 +1,5 @@ read: - input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples + input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples split: chunk_size: 1024 # chunk size for text splitting chunk_overlap: 100 # chunk overlap for text splitting @@ -18,5 +18,5 @@ partition: # graph partition configuration max_tokens_per_community: 10240 # max tokens per community unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss generate: - mode: aggregated # atomic, aggregated, multi_hop, cot + mode: aggregated # atomic, aggregated, multi_hop, cot, vqa data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/graphgen/configs/atomic_config.yaml b/graphgen/configs/atomic_config.yaml index d50ea421..ed1198a9 100644 --- a/graphgen/configs/atomic_config.yaml +++ b/graphgen/configs/atomic_config.yaml @@ -1,5 +1,5 @@ read: - input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv. See resources/input_examples for examples + input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. 
See resources/input_examples for examples split: chunk_size: 1024 # chunk size for text splitting chunk_overlap: 100 # chunk overlap for text splitting @@ -15,5 +15,5 @@ partition: # graph partition configuration method_params: max_units_per_community: 1 # atomic partition, one node or edge per community generate: - mode: atomic # atomic, aggregated, multi_hop, cot + mode: atomic # atomic, aggregated, multi_hop, cot, vqa data_format: Alpaca # Alpaca, Sharegpt, ChatML diff --git a/graphgen/configs/cot_config.yaml b/graphgen/configs/cot_config.yaml index 87dd3462..7873cbfb 100644 --- a/graphgen/configs/cot_config.yaml +++ b/graphgen/configs/cot_config.yaml @@ -1,5 +1,5 @@ read: - input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt. See resources/input_examples for examples + input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples split: chunk_size: 1024 # chunk size for text splitting chunk_overlap: 100 # chunk overlap for text splitting @@ -15,5 +15,5 @@ partition: # graph partition configuration use_lcc: false # whether to use the largest connected component random_seed: 42 # random seed for partitioning generate: - mode: cot # atomic, aggregated, multi_hop, cot + mode: cot # atomic, aggregated, multi_hop, cot, vqa data_format: Sharegpt # Alpaca, Sharegpt, ChatML diff --git a/graphgen/configs/multi_hop_config.yaml b/graphgen/configs/multi_hop_config.yaml index 7d23048e..5862a058 100644 --- a/graphgen/configs/multi_hop_config.yaml +++ b/graphgen/configs/multi_hop_config.yaml @@ -1,5 +1,5 @@ read: - input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt. See resources/input_examples for examples + input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, csv, pdf. 
See resources/input_examples for examples split: chunk_size: 1024 # chunk size for text splitting chunk_overlap: 100 # chunk overlap for text splitting @@ -18,5 +18,5 @@ partition: # graph partition configuration max_tokens_per_community: 10240 # max tokens per community unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss generate: - mode: multi_hop # strategy for generating multi-hop QA pairs + mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/graphgen/configs/vqa_config.yaml b/graphgen/configs/vqa_config.yaml new file mode 100644 index 00000000..06ae04ff --- /dev/null +++ b/graphgen/configs/vqa_config.yaml @@ -0,0 +1,22 @@ +read: + input_file: resources/input_examples/pdf_demo.pdf # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples +split: + chunk_size: 1024 # chunk size for text splitting + chunk_overlap: 100 # chunk overlap for text splitting +search: # web search configuration + enabled: false # whether to enable web search + search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia +quiz_and_judge: # quiz and test whether the LLM masters the knowledge points + enabled: true + quiz_samples: 2 # number of quiz samples to generate + re_judge: false # whether to re-judge the existing quiz samples +partition: # graph partition configuration + method: ece # ece is a custom partition method based on comprehension loss + method_params: + max_units_per_community: 20 # max nodes and edges per community + min_units_per_community: 5 # min nodes and edges per community + max_tokens_per_community: 10240 # max tokens per community + unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss +generate: + mode: vqa # atomic, aggregated, multi_hop, cot, vqa + data_format: ChatML # Alpaca, Sharegpt, ChatML diff --git a/graphgen/generate.py b/graphgen/generate.py index 
e8509cb5..e14ee849 100644 --- a/graphgen/generate.py +++ b/graphgen/generate.py @@ -72,24 +72,11 @@ def main(): graph_gen.search(search_config=config["search"]) - # Use pipeline according to the output data type - if mode in ["atomic", "aggregated", "multi_hop"]: - logger.info("Generation mode set to '%s'. Start generation.", mode) - if "quiz_and_judge" in config and config["quiz_and_judge"]["enabled"]: - graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"]) - else: - logger.warning( - "Quiz and Judge strategy is disabled. Edge sampling falls back to random." - ) - assert ( - config["partition"]["method"] == "ece" - and "method_params" in config["partition"] - ), "Only ECE partition with edge sampling is supported." - config["partition"]["method_params"]["edge_sampling"] = "random" - elif mode == "cot": - logger.info("Generation mode set to 'cot'. Start generation.") - else: - raise ValueError(f"Unsupported output data type: {mode}") + if config.get("quiz_and_judge", {}).get("enabled"): + graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"]) + + # TODO: add data filtering step here in the future + # graph_gen.filter(filter_config=config["filter"]) graph_gen.generate( partition_config=config["partition"], diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py index 7336c5e3..3a95a274 100644 --- a/graphgen/graphgen.py +++ b/graphgen/graphgen.py @@ -91,7 +91,7 @@ async def insert(self, read_config: Dict, split_config: Dict): insert chunks into the graph """ # Step 1: Read files - data = read_files(read_config["input_file"]) + data = read_files(read_config["input_file"], self.working_dir) if len(data) == 0: logger.warning("No data to process") return @@ -105,6 +105,7 @@ async def insert(self, read_config: Dict, split_config: Dict): "content": doc["content"] } for doc in data + if doc.get("type", "text") == "text" } _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys())) new_docs = {k: v for k, v in 
new_docs.items() if k in _add_doc_keys} diff --git a/graphgen/models/__init__.py b/graphgen/models/__init__.py index d9869244..68944079 100644 --- a/graphgen/models/__init__.py +++ b/graphgen/models/__init__.py @@ -4,6 +4,7 @@ AtomicGenerator, CoTGenerator, MultiHopGenerator, + VQAGenerator, ) from .kg_builder import LightRAGKGBuilder from .llm.openai_client import OpenAIClient @@ -14,7 +15,7 @@ ECEPartitioner, LeidenPartitioner, ) -from .reader import CsvReader, JsonlReader, JsonReader, TxtReader +from .reader import CSVReader, JSONLReader, JSONReader, PDFReader, TXTReader from .search.db.uniprot_search import UniProtSearch from .search.kg.wiki_search import WikiSearch from .search.web.bing_search import BingSearch diff --git a/graphgen/models/generator/__init__.py b/graphgen/models/generator/__init__.py index dab300ee..4469c065 100644 --- a/graphgen/models/generator/__init__.py +++ b/graphgen/models/generator/__init__.py @@ -2,3 +2,4 @@ from .atomic_generator import AtomicGenerator from .cot_generator import CoTGenerator from .multi_hop_generator import MultiHopGenerator +from .vqa_generator import VQAGenerator diff --git a/graphgen/models/generator/vqa_generator.py b/graphgen/models/generator/vqa_generator.py new file mode 100644 index 00000000..05d1867b --- /dev/null +++ b/graphgen/models/generator/vqa_generator.py @@ -0,0 +1,23 @@ +from dataclasses import dataclass +from typing import Any + +from graphgen.bases import BaseGenerator + + +@dataclass +class VQAGenerator(BaseGenerator): + @staticmethod + def build_prompt( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> str: + raise NotImplementedError( + "VQAGenerator.build_prompt is not implemented. " + "Please provide an implementation for VQA prompt construction." + ) + + @staticmethod + def parse_response(response: str) -> Any: + raise NotImplementedError( + "VQAGenerator.parse_response is not implemented. " + "Please provide an implementation for VQA response parsing." 
+ ) diff --git a/graphgen/models/reader/__init__.py b/graphgen/models/reader/__init__.py index 0fca9032..45902f04 100644 --- a/graphgen/models/reader/__init__.py +++ b/graphgen/models/reader/__init__.py @@ -1,4 +1,5 @@ -from .csv_reader import CsvReader -from .json_reader import JsonReader -from .jsonl_reader import JsonlReader -from .txt_reader import TxtReader +from .csv_reader import CSVReader +from .json_reader import JSONReader +from .jsonl_reader import JSONLReader +from .pdf_reader import PDFReader +from .txt_reader import TXTReader diff --git a/graphgen/models/reader/csv_reader.py b/graphgen/models/reader/csv_reader.py index 05960082..555a245f 100644 --- a/graphgen/models/reader/csv_reader.py +++ b/graphgen/models/reader/csv_reader.py @@ -5,7 +5,7 @@ from graphgen.bases.base_reader import BaseReader -class CsvReader(BaseReader): +class CSVReader(BaseReader): def read(self, file_path: str) -> List[Dict[str, Any]]: df = pd.read_csv(file_path) diff --git a/graphgen/models/reader/json_reader.py b/graphgen/models/reader/json_reader.py index 98e1e16a..932dd01c 100644 --- a/graphgen/models/reader/json_reader.py +++ b/graphgen/models/reader/json_reader.py @@ -4,7 +4,7 @@ from graphgen.bases.base_reader import BaseReader -class JsonReader(BaseReader): +class JSONReader(BaseReader): def read(self, file_path: str) -> List[Dict[str, Any]]: with open(file_path, "r", encoding="utf-8") as f: data = json.load(f) diff --git a/graphgen/models/reader/jsonl_reader.py b/graphgen/models/reader/jsonl_reader.py index 8904bbb3..744ed39e 100644 --- a/graphgen/models/reader/jsonl_reader.py +++ b/graphgen/models/reader/jsonl_reader.py @@ -5,7 +5,7 @@ from graphgen.utils import logger -class JsonlReader(BaseReader): +class JSONLReader(BaseReader): def read(self, file_path: str) -> List[Dict[str, Any]]: docs = [] with open(file_path, "r", encoding="utf-8") as f: diff --git a/graphgen/models/reader/pdf_reader.py b/graphgen/models/reader/pdf_reader.py new file mode 100644 index 
00000000..c8ca13c1 --- /dev/null +++ b/graphgen/models/reader/pdf_reader.py @@ -0,0 +1,235 @@ +import json +import os +import subprocess +import tempfile +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +from graphgen.bases.base_reader import BaseReader +from graphgen.models.reader.txt_reader import TXTReader +from graphgen.utils import logger, pick_device + + +class PDFReader(BaseReader): + """ + PDF files are converted using MinerU, see [MinerU](https://github.com/opendatalab/MinerU). + After conversion, the resulting markdown file is parsed into text, images, tables, and formulas which can be used + for multi-modal graph generation. + """ + + def __init__( + self, + *, + output_dir: Optional[Union[str, Path]] = None, + method: str = "auto", # auto | txt | ocr + lang: Optional[str] = None, # ch / en / ja / ... + backend: Optional[ + str + ] = None, # pipeline | vlm-transformers | vlm-sglang-engine | vlm-sglang-client + device: Optional[str] = "auto", # cpu | cuda | cuda:0 | npu | mps | auto + source: Optional[str] = None, # huggingface | modelscope | local + vlm_url: Optional[str] = None, # 当 backend=vlm-sglang-client 时必填 + start_page: Optional[int] = None, # 0-based + end_page: Optional[int] = None, # 0-based, inclusive + formula: bool = True, + table: bool = True, + return_assets: bool = True, + **other_mineru_kwargs: Any, + ): + super().__init__() + self.output_dir = os.path.join(output_dir, "mineru") if output_dir else None + + if device == "auto": + device = pick_device() + + self._default_kwargs: Dict[str, Any] = { + "method": method, + "lang": lang, + "backend": backend, + "device": device, + "source": source, + "vlm_url": vlm_url, + "start_page": start_page, + "end_page": end_page, + "formula": formula, + "table": table, + **other_mineru_kwargs, + } + self._default_kwargs = { + k: v for k, v in self._default_kwargs.items() if v is not None + } + self.return_assets = return_assets + self.parser = MinerUParser() + 
self.txt_reader = TXTReader() + + def read(self, file_path: str, **override) -> List[Dict[str, Any]]: + """ + file_path + **override: override MinerU parameters + """ + pdf_path = Path(file_path).expanduser().resolve() + if not pdf_path.is_file(): + raise FileNotFoundError(pdf_path) + + kwargs = {**self._default_kwargs, **override} + + mineru_result = self._call_mineru(pdf_path, kwargs) + return mineru_result + + def _call_mineru( + self, pdf_path: Path, kwargs: Dict[str, Any] + ) -> List[Dict[str, Any]]: + output_dir: Optional[str] = None + if self.output_dir: + output_dir = str(self.output_dir) + + return self.parser.parse_pdf(pdf_path, output_dir=output_dir, **kwargs) + + def _locate_md(self, pdf_path: Path, kwargs: Dict[str, Any]) -> Optional[Path]: + out_dir = ( + Path(self.output_dir) if self.output_dir else Path(tempfile.gettempdir()) + ) + method = kwargs.get("method", "auto") + backend = kwargs.get("backend", "") + if backend.startswith("vlm-"): + method = "vlm" + + candidate = Path( + os.path.join(out_dir, pdf_path.stem, method, f"{pdf_path.stem}.md") + ) + if candidate.exists(): + return candidate + candidate = Path(os.path.join(out_dir, f"{pdf_path.stem}.md")) + if candidate.exists(): + return candidate + return None + + +class MinerUParser: + def __init__(self) -> None: + self._check_bin() + + @staticmethod + def parse_pdf( + pdf_path: Union[str, Path], + output_dir: Optional[Union[str, Path]] = None, + method: str = "auto", + device: str = "cpu", + **kw: Any, + ) -> List[Dict[str, Any]]: + pdf = Path(pdf_path).expanduser().resolve() + if not pdf.is_file(): + raise FileNotFoundError(pdf) + + out = ( + Path(output_dir) if output_dir else Path(tempfile.mkdtemp(prefix="mineru_")) + ) + out.mkdir(parents=True, exist_ok=True) + + cached = MinerUParser._try_load_cached_result(str(out), pdf.stem, method) + if cached is not None: + return cached + + MinerUParser._run_mineru(pdf, out, method, device, **kw) + + cached = 
MinerUParser._try_load_cached_result(str(out), pdf.stem, method) + return cached if cached is not None else [] + + @staticmethod + def _try_load_cached_result( + out_dir: str, pdf_stem: str, method: str + ) -> Optional[List[Dict[str, Any]]]: + """ + try to load cached json result from MinerU output. + :param out_dir: + :param pdf_stem: + :param method: + :return: + """ + json_file = os.path.join( + out_dir, pdf_stem, method, f"{pdf_stem}_content_list.json" + ) + if not os.path.exists(json_file): + return None + + try: + with open(json_file, encoding="utf-8") as f: + data = json.load(f) + except Exception as exc: # pylint: disable=broad-except + logger.warning("Failed to load cached MinerU result: %s", exc) + return None + + base = os.path.dirname(json_file) + results = [] + for item in data: + for key in ("img_path", "table_img_path", "equation_img_path"): + rel_path = item.get(key) + if rel_path: + item[key] = str(Path(base).joinpath(rel_path).resolve()) + if item["type"] == "text": + item["content"] = item["text"] + del item["text"] + for key in ("page_idx", "bbox", "text_level"): + if item.get(key) is not None: + del item[key] + if item["type"] == "text" and not item["content"].strip(): + continue + results.append(item) + return results + + @staticmethod + def _run_mineru( + pdf: Path, + out: Path, + method: str, + device: str, + **kw: Any, + ) -> None: + cmd = [ + "mineru", + "-p", + str(pdf), + "-o", + str(out), + "-m", + method, + "-d", + device, + ] + for k, v in kw.items(): + if v is None: + continue + if isinstance(v, bool): + cmd += [f"--{k}", str(v).lower()] + else: + cmd += [f"--{k}", str(v)] + + logger.info("Parsing PDF with MinerU: %s", pdf) + logger.debug("Running MinerU command: %s", " ".join(cmd)) + + proc = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + encoding="utf-8", + errors="ignore", + check=False, + ) + if proc.returncode != 0: + raise RuntimeError(f"MinerU failed: {proc.stderr or proc.stdout}") + 
+ @staticmethod + def _check_bin() -> None: + try: + subprocess.run( + ["mineru", "--version"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True, + ) + except (subprocess.CalledProcessError, FileNotFoundError) as exc: + raise RuntimeError( + "MinerU is not installed or not found in PATH. Please install it from pip: \n" + "pip install -U 'mineru[core]'" + ) from exc diff --git a/graphgen/models/reader/txt_reader.py b/graphgen/models/reader/txt_reader.py index f9419ebd..5d30703b 100644 --- a/graphgen/models/reader/txt_reader.py +++ b/graphgen/models/reader/txt_reader.py @@ -3,7 +3,7 @@ from graphgen.bases.base_reader import BaseReader -class TxtReader(BaseReader): +class TXTReader(BaseReader): def read(self, file_path: str) -> List[Dict[str, Any]]: docs = [] with open(file_path, "r", encoding="utf-8") as f: diff --git a/graphgen/operators/generate/generate_qas.py b/graphgen/operators/generate/generate_qas.py index 51ba22ad..feadafc1 100644 --- a/graphgen/operators/generate/generate_qas.py +++ b/graphgen/operators/generate/generate_qas.py @@ -6,6 +6,7 @@ AtomicGenerator, CoTGenerator, MultiHopGenerator, + VQAGenerator, ) from graphgen.utils import logger, run_concurrent @@ -39,6 +40,8 @@ async def generate_qas( generator = MultiHopGenerator(llm_client) elif mode == "cot": generator = CoTGenerator(llm_client) + elif mode == "vqa": + generator = VQAGenerator(llm_client) else: raise ValueError(f"Unsupported generation mode: {mode}") diff --git a/graphgen/operators/read/read_files.py b/graphgen/operators/read/read_files.py index e1d13a2b..47f6ee21 100644 --- a/graphgen/operators/read/read_files.py +++ b/graphgen/operators/read/read_files.py @@ -1,16 +1,22 @@ -from graphgen.models import CsvReader, JsonlReader, JsonReader, TxtReader +from graphgen.models import CSVReader, JSONLReader, JSONReader, PDFReader, TXTReader _MAPPING = { - "jsonl": JsonlReader, - "json": JsonReader, - "txt": TxtReader, - "csv": CsvReader, + "jsonl": JSONLReader, + "json": 
JSONReader, + "txt": TXTReader, + "csv": CSVReader, + "pdf": PDFReader, } -def read_files(file_path: str): - suffix = file_path.split(".")[-1] - if suffix in _MAPPING: +def read_files(file_path: str, cache_dir: str | None = None) -> list[dict]: + suffix = file_path.split(".")[-1].lower() + if suffix == "pdf": + if cache_dir is not None: + reader = _MAPPING[suffix](output_dir=cache_dir) + else: + reader = _MAPPING[suffix]() + elif suffix in _MAPPING: reader = _MAPPING[suffix]() else: raise ValueError( diff --git a/graphgen/utils/__init__.py b/graphgen/utils/__init__.py index 3d80d2df..43b61906 100644 --- a/graphgen/utils/__init__.py +++ b/graphgen/utils/__init__.py @@ -1,5 +1,6 @@ from .calculate_confidence import yes_no_loss_entropy from .detect_lang import detect_if_chinese, detect_main_language +from .device import pick_device from .format import ( handle_single_entity_extraction, handle_single_relationship_extraction, diff --git a/graphgen/utils/device.py b/graphgen/utils/device.py new file mode 100644 index 00000000..1e5d8642 --- /dev/null +++ b/graphgen/utils/device.py @@ -0,0 +1,44 @@ +import shutil +import subprocess +import sys + + +def pick_device() -> str: + """Return the best available device string for MinerU.""" + # 1. NVIDIA GPU + if shutil.which("nvidia-smi") is not None: + try: + # check if there's any free GPU memory + out = subprocess.check_output( + [ + "nvidia-smi", + "--query-gpu=memory.free", + "--format=csv,noheader,nounits", + ], + text=True, + ) + if any(int(line) > 0 for line in out.strip().splitlines()): + return "cuda:0" + except Exception: # pylint: disable=broad-except + pass + + # 2. Apple Silicon + if sys.platform == "darwin" and shutil.which("sysctl"): + try: + brand = subprocess.check_output( + ["sysctl", "-n", "machdep.cpu.brand_string"], text=True + ) + if "Apple" in brand: + return "mps" + except Exception: # pylint: disable=broad-except + pass + + # 3. 
Ascend NPU + if shutil.which("npu-smi") is not None: + try: + subprocess.check_call(["npu-smi", "info"], stdout=subprocess.DEVNULL) + return "npu" + except Exception: # pylint: disable=broad-except + pass + + return "cpu" diff --git a/resources/input_examples/csv_demo.csv b/resources/input_examples/csv_demo.csv index 11e6dde3..e6c1b521 100644 --- a/resources/input_examples/csv_demo.csv +++ b/resources/input_examples/csv_demo.csv @@ -1,5 +1,5 @@ -content -"云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。" -"隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。" -"Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. 
Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity." -"Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture." +type,content +text,云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。 +text,隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。 +text,"Grain size is one of the key factors determining grain yield. 
However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity." +text,"Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. 
Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture." diff --git a/resources/input_examples/json_demo.json b/resources/input_examples/json_demo.json index b496c16f..a53101da 100644 --- a/resources/input_examples/json_demo.json +++ b/resources/input_examples/json_demo.json @@ -1,6 +1,6 @@ [ - {"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, - {"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, - {"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. 
Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, - {"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. 
Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} + {"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, + {"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, + {"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. 
Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, + {"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} ] diff --git a/resources/input_examples/jsonl_demo.jsonl b/resources/input_examples/jsonl_demo.jsonl index 024559a6..2b743578 100644 --- a/resources/input_examples/jsonl_demo.jsonl +++ b/resources/input_examples/jsonl_demo.jsonl @@ -1,4 +1,4 @@ -{"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} -{"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} -{"content": "Grain size is one of the key factors determining grain yield. 
However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} -{"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. 
Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} +{"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} +{"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} +{"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. 
Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} +{"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} diff --git a/resources/input_examples/pdf_demo.pdf b/resources/input_examples/pdf_demo.pdf new file mode 100644 index 00000000..5a328e4c Binary files /dev/null and b/resources/input_examples/pdf_demo.pdf differ diff --git a/resources/input_examples/vqa_demo.json b/resources/input_examples/vqa_demo.json new file mode 100644 index 00000000..a53101da --- /dev/null +++ b/resources/input_examples/vqa_demo.json @@ -0,0 +1,6 @@ +[ + {"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, + {"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 
t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, + {"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, + {"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. 
TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} +] diff --git a/scripts/generate/generate_vqa.sh b/scripts/generate/generate_vqa.sh new file mode 100644 index 00000000..91c4aa1e --- /dev/null +++ b/scripts/generate/generate_vqa.sh @@ -0,0 +1,3 @@ +python3 -m graphgen.generate \ +--config_file graphgen/configs/vqa_config.yaml \ +--output_dir cache/ diff --git a/tests/integration_tests/models/reader/test_mineru_parser.py b/tests/integration_tests/models/reader/test_mineru_parser.py new file mode 100644 index 00000000..befafa90 --- /dev/null +++ b/tests/integration_tests/models/reader/test_mineru_parser.py @@ -0,0 +1,41 @@ +import os +from pathlib import Path + +from graphgen.models.reader.pdf_reader import MinerUParser + + +def test_check_bin(): + """Ensure mineru CLI is available.""" + MinerUParser() + + +def test_parse_pdf(): + """Parse a real PDF and verify basic structure.""" + repo_root = Path(__file__).resolve().parents[4] + + sample_pdf = os.path.join(repo_root, "resources", "input_examples", "pdf_demo.pdf") + parser = MinerUParser() + blocks = parser.parse_pdf(sample_pdf, device="cpu", method="auto") + + assert isinstance(blocks, list) + assert blocks, "At least one block expected" + + text_blocks = [b for b in blocks if b.get("type") == "text"] + assert text_blocks, "No text block found" + + first = text_blocks[0] + assert "text" in first + assert isinstance(first["text"], str) + assert first["text"].strip(), "Empty text content" + + +def test_empty_pdf(tmp_path: Path) -> None: + """Gracefully handle blank PDF.""" + empty = tmp_path / "empty.pdf" + 
empty.write_bytes(b"%PDF-1.4\n%%EOF\n") # syntactically valid, no content + + parser = MinerUParser() + blocks = parser.parse_pdf(empty, device="cpu") + + # Empty list or list with empty text block are both acceptable + assert isinstance(blocks, list) diff --git a/webui/examples/csv_demo.csv b/webui/examples/csv_demo.csv index 11e6dde3..e6c1b521 100644 --- a/webui/examples/csv_demo.csv +++ b/webui/examples/csv_demo.csv @@ -1,5 +1,5 @@ -content -"云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。" -"隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。" -"Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. 
Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity." -"Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture." +type,content +text,云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。 +text,隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。 +text,"Grain size is one of the key factors determining grain yield. 
However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity." +text,"Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. 
Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture." diff --git a/webui/examples/json_demo.json b/webui/examples/json_demo.json index b496c16f..a53101da 100644 --- a/webui/examples/json_demo.json +++ b/webui/examples/json_demo.json @@ -1,6 +1,6 @@ [ - {"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, - {"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, - {"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. 
Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, - {"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. 
Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} + {"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, + {"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, + {"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. 
Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, + {"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} ] diff --git a/webui/examples/jsonl_demo.jsonl b/webui/examples/jsonl_demo.jsonl index 024559a6..2b743578 100644 --- a/webui/examples/jsonl_demo.jsonl +++ b/webui/examples/jsonl_demo.jsonl @@ -1,4 +1,4 @@ -{"content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} -{"content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} -{"content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. 
Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} -{"content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. 
Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} +{"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"} +{"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"} +{"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. 
Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."} +{"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} diff --git a/webui/examples/pdf_demo.pdf b/webui/examples/pdf_demo.pdf new file mode 100644 index 00000000..5a328e4c Binary files /dev/null and b/webui/examples/pdf_demo.pdf differ diff --git a/webui/examples/vqa_demo.json b/webui/examples/vqa_demo.json new file mode 100644 index 00000000..a53101da --- /dev/null +++ b/webui/examples/vqa_demo.json @@ -0,0 +1,6 @@ +[ + {"type": "text", "content": "云南省农业科学院粮食作物研究所于2005年育成早熟品种云粳26号,该品种外观特点为: 颖尖无色、无芒,谷壳黄色,落粒性适中,米粒大,有香味,食味品质好,高抗稻瘟病,适宜在云南中海拔 1 500∼1 800 m 稻区种植。2012年被农业部列为西南稻区农业推广主导品种。"}, + {"type": "text", "content": "隆两优1212 于2017 年引入福建省龙岩市长汀县试种,在长汀县圣丰家庭农场(河田镇南塘村)种植,土壤肥力中等、排灌方便[2],试种面积 0.14 hm^2 ,作烟后稻种植,6 月15 日机播,7月5 日机插,10 月21 日成熟,产量 8.78 t/hm^2 。2018 和2019 年分别在长汀润丰优质稻专业合作社(濯田镇永巫村)和长汀县绿丰优质稻专业合作社(河田镇中街村)作烟后稻进一步扩大示范种植,均采用机播机插机收。2018 年示范面积 4.00 hm^2 ,平均产量 8.72 t/hm^2 ;2019 年示范面积 13.50 hm^2 ,平均产量 8.74 t/hm^2 。经3 a 试种、示范,隆两优1212 
表现出分蘖力强、抗性好、抽穗整齐、后期转色好、生育期适中、产量高、适应性好等特点,可作为烟后稻在长汀县推广种植。"}, + {"type": "text", "content": "Grain size is one of the key factors determining grain yield. However, it remains largely unknown how grain size is regulated by developmental signals. Here, we report the identification and characterization of a dominant mutant big grain1 (Bg1-D) that shows an extra-large grain phenotype from our rice T-DNA insertion population. Overexpression of BG1 leads to significantly increased grain size, and the severe lines exhibit obviously perturbed gravitropism. In addition, the mutant has increased sensitivities to both auxin and N-1-naphthylphthalamic acid, an auxin transport inhibitor, whereas knockdown of BG1 results in decreased sensitivities and smaller grains. Moreover, BG1 is specifically induced by auxin treatment, preferentially expresses in the vascular tissue of culms and young panicles, and encodes a novel membrane-localized protein, strongly suggesting its role in regulating auxin transport. Consistent with this finding, the mutant has increased auxin basipetal transport and altered auxin distribution, whereas the knockdown plants have decreased auxin transport. Manipulation of BG1 in both rice and Arabidopsis can enhance plant biomass, seed weight, and yield. Taking these data together, we identify a novel positive regulator of auxin response and transport in a crop plant and demonstrate its role in regulating grain size, thus illuminating a new strategy to improve plant productivity."}, + {"type": "text", "content": "Tiller angle, an important component of plant architecture, greatly influences the grain yield of rice (Oryza sativa L.). Here, we identified Tiller Angle Control 4 (TAC4) as a novel regulator of rice tiller angle. TAC4 encodes a plant-specific, highly conserved nuclear protein. The loss of TAC4 function leads to a significant increase in the tiller angle. 
TAC4 can regulate rice shoot\n\ngravitropism by increasing the indole acetic acid content and affecting the auxin distribution. A sequence analysis revealed that TAC4 has undergone a bottleneck and become fixed in indica cultivars during domestication and improvement. Our findings facilitate an increased understanding of the regulatory mechanisms of tiller angle and also provide a potential gene resource for the improvement of rice plant architecture."} +]