From 7ceb0ac5bc977f787bfa829e89e512a970373e1c Mon Sep 17 00:00:00 2001 From: liz Date: Fri, 7 Mar 2025 19:17:09 +0800 Subject: [PATCH 1/3] fix bugs and optimize extraction prompt and parse_fn --- .../docs/examples/cookbooks/GraphRAG_v1.ipynb | 69 +++++++++++++++---- .../docs/examples/cookbooks/GraphRAG_v2.ipynb | 65 ++++++++++++++--- 2 files changed, 110 insertions(+), 24 deletions(-) diff --git a/docs/docs/examples/cookbooks/GraphRAG_v1.ipynb b/docs/docs/examples/cookbooks/GraphRAG_v1.ipynb index 2c04ae29b7640..d384c58183607 100644 --- a/docs/docs/examples/cookbooks/GraphRAG_v1.ipynb +++ b/docs/docs/examples/cookbooks/GraphRAG_v1.ipynb @@ -622,7 +622,7 @@ "\n", " metadata = node.metadata.copy()\n", " for triple in entities_relationship:\n", - " subj, rel, obj, description = triple\n", + " subj, obj, rel, description = triple\n", " subj_node = EntityNode(name=subj, properties=metadata)\n", " obj_node = EntityNode(name=obj, properties=metadata)\n", " metadata[\"relationship_description\"] = description\n", @@ -996,7 +996,6 @@ "- entity_name: Name of the entity, capitalized\n", "- entity_type: Type of the entity\n", "- entity_description: Comprehensive description of the entity's attributes and activities\n", - "Format each entity as (\"entity\"$$$$\"\"$$$$\"\"$$$$\"\")\n", "\n", "2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.\n", "For each pair of related entities, extract the following information:\n", @@ -1005,9 +1004,45 @@ "- relation: relationship between source_entity and target_entity\n", "- relationship_description: explanation as to why you think the source entity and the target entity are related to each other\n", "\n", - "Format each relationship as (\"relationship\"$$$$\"\"$$$$\"\"$$$$\"\"$$$$\"\")\n", - "\n", - "3. When finished, output.\n", + "3. Output Formatting:\n", + "- Return the result in valid JSON format with two keys: 'entities' (list of entity objects) and 'relationships' (list of relationship objects).\n", + "- Exclude any text outside the JSON structure (e.g., no explanations or comments).\n", + "- If no entities or relationships are identified, return empty lists: { \"entities\": [], \"relationships\": [] }.\n", + "\n", + "-An Output Example-\n", + "{\n", + " \"entities\": [\n", + " {\n", + " \"entity_name\": \"Albert Einstein\",\n", + " \"entity_type\": \"Person\",\n", + " \"entity_description\": \"Albert Einstein was a theoretical physicist who developed the theory of relativity and made significant contributions to physics.\"\n", + " },\n", + " {\n", + " \"entity_name\": \"Theory of Relativity\",\n", + " \"entity_type\": \"Scientific Theory\",\n", + " \"entity_description\": \"A scientific theory developed by Albert Einstein, describing the laws of physics in relation to observers in different frames of reference.\"\n", + " },\n", + " {\n", + " \"entity_name\": \"Nobel Prize in Physics\",\n", + " \"entity_type\": \"Award\",\n", + " \"entity_description\": \"A prestigious international award in the field of physics, awarded annually by the Royal Swedish Academy of Sciences.\"\n", + " }\n", + " ],\n", + " \"relationships\": [\n", + " {\n", + " \"source_entity\": \"Albert Einstein\",\n", + " \"target_entity\": \"Theory of Relativity\",\n", + " \"relation\": \"developed\",\n", + " \"relationship_description\": \"Albert Einstein is the developer of the theory of relativity.\"\n", + " },\n", + " {\n", + " \"source_entity\": \"Albert Einstein\",\n", + " \"target_entity\": \"Nobel Prize in Physics\",\n", + " \"relation\": \"won\",\n", + " \"relationship_description\": \"Albert Einstein won the Nobel Prize in Physics in 1921.\"\n", + " }\n", + " ]\n", + "}\n", "\n", "-Real Data-\n", "######################\n", @@ -1022,16 +1057,24 @@ "metadata": {}, "outputs": [], "source": [ - "entity_pattern = r'\\(\"entity\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\)'\n", - "relationship_pattern = r'\\(\"relationship\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\)'\n", - "\n", + "import json\n", "\n", "def parse_fn(response_str: str) -> Any:\n", - " entities = re.findall(entity_pattern, response_str)\n", - " relationships = re.findall(relationship_pattern, response_str)\n", - " return entities, relationships\n", - "\n", - "\n", + " json_pattern = r'\\{.*\\}'\n", + " match = re.search(json_pattern, response_str, re.DOTALL) \n", + " entities = []\n", + " relationships = []\n", + " if not match: return entities, relationships \n", + " json_str = match.group(0)\n", + " try:\n", + " data = json.loads(json_str)\n", + " entities = [(entity['entity_name'], entity['entity_type'], entity['entity_description']) for entity in data.get('entities', [])]\n", + " relationships = [(relation['source_entity'], relation['target_entity'], relation['relation'], relation['relationship_description']) for relation in data.get('relationships', [])]\n", + " return entities, relationships\n", + " except json.JSONDecodeError as e:\n", + " print(\"Error parsing JSON:\", e)\n", + " return entities, relationships\n", + " \n", "kg_extractor = GraphRAGExtractor(\n", " llm=llm,\n", " extract_prompt=KG_TRIPLET_EXTRACT_TMPL,\n", diff --git a/docs/docs/examples/cookbooks/GraphRAG_v2.ipynb b/docs/docs/examples/cookbooks/GraphRAG_v2.ipynb index 7e1f2e71809cb..da4d404a14df4 100644 --- a/docs/docs/examples/cookbooks/GraphRAG_v2.ipynb +++ b/docs/docs/examples/cookbooks/GraphRAG_v2.ipynb @@ -778,7 +778,6 @@ "- entity_name: Name of the entity, capitalized\n", "- entity_type: Type of the entity\n", "- entity_description: Comprehensive description of the entity's attributes and activities\n", - "Format each entity as (\"entity\"$$$$\"\"$$$$\"\"$$$$\"\")\n", "\n", "2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.\n", "For each pair of related entities, extract the following information:\n", @@ -787,9 +786,45 @@ "- relation: relationship between source_entity and target_entity\n", "- relationship_description: explanation as to why you think the source entity and the target entity are related to each other\n", "\n", - "Format each relationship as (\"relationship\"$$$$\"\"$$$$\"\"$$$$\"\"$$$$\"\")\n", - "\n", - "3. When finished, output.\n", + "3. Output Formatting:\n", + "- Return the result in valid JSON format with two keys: 'entities' (list of entity objects) and 'relationships' (list of relationship objects).\n", + "- Exclude any text outside the JSON structure (e.g., no explanations or comments).\n", + "- If no entities or relationships are identified, return empty lists: { \"entities\": [], \"relationships\": [] }.\n", + "\n", + "-An Output Example-\n", + "{\n", + " \"entities\": [\n", + " {\n", + " \"entity_name\": \"Albert Einstein\",\n", + " \"entity_type\": \"Person\",\n", + " \"entity_description\": \"Albert Einstein was a theoretical physicist who developed the theory of relativity and made significant contributions to physics.\"\n", + " },\n", + " {\n", + " \"entity_name\": \"Theory of Relativity\",\n", + " \"entity_type\": \"Scientific Theory\",\n", + " \"entity_description\": \"A scientific theory developed by Albert Einstein, describing the laws of physics in relation to observers in different frames of reference.\"\n", + " },\n", + " {\n", + " \"entity_name\": \"Nobel Prize in Physics\",\n", + " \"entity_type\": \"Award\",\n", + " \"entity_description\": \"A prestigious international award in the field of physics, awarded annually by the Royal Swedish Academy of Sciences.\"\n", + " }\n", + " ],\n", + " \"relationships\": [\n", + " {\n", + " \"source_entity\": \"Albert Einstein\",\n", + " \"target_entity\": \"Theory of Relativity\",\n", + " \"relation\": \"developed\",\n", + " \"relationship_description\": \"Albert Einstein is the developer of the theory of relativity.\"\n", + " },\n", + " {\n", + " \"source_entity\": \"Albert Einstein\",\n", + " \"target_entity\": \"Nobel Prize in Physics\",\n", + " \"relation\": \"won\",\n", + " \"relationship_description\": \"Albert Einstein won the Nobel Prize in Physics in 1921.\"\n", + " }\n", + " ]\n", + "}\n", "\n", "-Real Data-\n", "######################\n", @@ -804,15 +839,23 @@ "metadata": {}, "outputs": [], "source": [ - "entity_pattern = r'\\(\"entity\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\)'\n", - "relationship_pattern = r'\\(\"relationship\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\)'\n", - "\n", + "import json\n", "\n", "def parse_fn(response_str: str) -> Any:\n", - " entities = re.findall(entity_pattern, response_str)\n", - " relationships = re.findall(relationship_pattern, response_str)\n", - " return entities, relationships\n", - "\n", + " json_pattern = r'\\{.*\\}'\n", + " match = re.search(json_pattern, response_str, re.DOTALL) \n", + " entities = []\n", + " relationships = []\n", + " if not match: return entities, relationships \n", + " json_str = match.group(0)\n", + " try:\n", + " data = json.loads(json_str)\n", + " entities = [(entity['entity_name'], entity['entity_type'], entity['entity_description']) for entity in data.get('entities', [])]\n", + " relationships = [(relation['source_entity'], relation['target_entity'], relation['relation'], relation['relationship_description']) for relation in data.get('relationships', [])]\n", + " return entities, relationships\n", + " except json.JSONDecodeError as e:\n", + " print(\"Error parsing JSON:\", e)\n", + " return entities, relationships\n", "\n", "kg_extractor = GraphRAGExtractor(\n", " llm=llm,\n", From 67e95c2b71f3155d028207ba8336f056d010a647 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Sun, 9 Mar 2025 22:21:22 +0000 Subject: [PATCH 2/3] linting --- .../docs/examples/cookbooks/GraphRAG_v1.ipynb | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/docs/docs/examples/cookbooks/GraphRAG_v1.ipynb b/docs/docs/examples/cookbooks/GraphRAG_v1.ipynb index d384c58183607..83de929561982 100644 --- a/docs/docs/examples/cookbooks/GraphRAG_v1.ipynb +++ b/docs/docs/examples/cookbooks/GraphRAG_v1.ipynb @@ -1059,22 +1059,40 @@ "source": [ "import json\n", "\n", + "\n", "def parse_fn(response_str: str) -> Any:\n", - " json_pattern = r'\\{.*\\}'\n", - " match = re.search(json_pattern, response_str, re.DOTALL) \n", + " json_pattern = r\"\\{.*\\}\"\n", + " match = re.search(json_pattern, response_str, re.DOTALL)\n", " entities = []\n", " relationships = []\n", - " if not match: return entities, relationships \n", + " if not match:\n", + " return entities, relationships\n", " json_str = match.group(0)\n", " try:\n", " data = json.loads(json_str)\n", - " entities = [(entity['entity_name'], entity['entity_type'], entity['entity_description']) for entity in data.get('entities', [])]\n", - " relationships = [(relation['source_entity'], relation['target_entity'], relation['relation'], relation['relationship_description']) for relation in data.get('relationships', [])]\n", + " entities = [\n", + " (\n", + " entity[\"entity_name\"],\n", + " entity[\"entity_type\"],\n", + " entity[\"entity_description\"],\n", + " )\n", + " for entity in data.get(\"entities\", [])\n", + " ]\n", + " relationships = [\n", + " (\n", + " relation[\"source_entity\"],\n", + " relation[\"target_entity\"],\n", + " relation[\"relation\"],\n", + " relation[\"relationship_description\"],\n", + " )\n", + " for relation in data.get(\"relationships\", [])\n", + " ]\n", " return entities, relationships\n", " except json.JSONDecodeError as e:\n", " print(\"Error parsing JSON:\", e)\n", " return entities, relationships\n", - " \n", + "\n", + "\n", "kg_extractor = GraphRAGExtractor(\n", " llm=llm,\n", " extract_prompt=KG_TRIPLET_EXTRACT_TMPL,\n", From 02ba46c5fec3d27fc0b3f0c986a429cffa58bfec Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Sun, 9 Mar 2025 22:23:58 +0000 Subject: [PATCH 3/3] linting --- .../docs/examples/cookbooks/GraphRAG_v2.ipynb | 28 +++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/docs/docs/examples/cookbooks/GraphRAG_v2.ipynb b/docs/docs/examples/cookbooks/GraphRAG_v2.ipynb index da4d404a14df4..8a3eb7622fd0e 100644 --- a/docs/docs/examples/cookbooks/GraphRAG_v2.ipynb +++ b/docs/docs/examples/cookbooks/GraphRAG_v2.ipynb @@ -841,22 +841,40 @@ "source": [ "import json\n", "\n", + "\n", "def parse_fn(response_str: str) -> Any:\n", - " json_pattern = r'\\{.*\\}'\n", - " match = re.search(json_pattern, response_str, re.DOTALL) \n", + " json_pattern = r\"\\{.*\\}\"\n", + " match = re.search(json_pattern, response_str, re.DOTALL)\n", " entities = []\n", " relationships = []\n", - " if not match: return entities, relationships \n", + " if not match:\n", + " return entities, relationships\n", " json_str = match.group(0)\n", " try:\n", " data = json.loads(json_str)\n", - " entities = [(entity['entity_name'], entity['entity_type'], entity['entity_description']) for entity in data.get('entities', [])]\n", - " relationships = [(relation['source_entity'], relation['target_entity'], relation['relation'], relation['relationship_description']) for relation in data.get('relationships', [])]\n", + " entities = [\n", + " (\n", + " entity[\"entity_name\"],\n", + " entity[\"entity_type\"],\n", + " entity[\"entity_description\"],\n", + " )\n", + " for entity in data.get(\"entities\", [])\n", + " ]\n", + " relationships = [\n", + " (\n", + " relation[\"source_entity\"],\n", + " relation[\"target_entity\"],\n", + " relation[\"relation\"],\n", + " relation[\"relationship_description\"],\n", + " )\n", + " for relation in data.get(\"relationships\", [])\n", + " ]\n", " return entities, relationships\n", " except json.JSONDecodeError as e:\n", " print(\"Error parsing JSON:\", e)\n", " return entities, relationships\n", "\n", + "\n", "kg_extractor = GraphRAGExtractor(\n", " llm=llm,\n", " extract_prompt=KG_TRIPLET_EXTRACT_TMPL,\n",