Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix bugs and optimize extraction prompt and parse_fn #18046

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 71 additions & 10 deletions docs/docs/examples/cookbooks/GraphRAG_v1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@
"\n",
" metadata = node.metadata.copy()\n",
" for triple in entities_relationship:\n",
" subj, rel, obj, description = triple\n",
" subj, obj, rel, description = triple\n",
" subj_node = EntityNode(name=subj, properties=metadata)\n",
" obj_node = EntityNode(name=obj, properties=metadata)\n",
" metadata[\"relationship_description\"] = description\n",
Expand Down Expand Up @@ -996,7 +996,6 @@
"- entity_name: Name of the entity, capitalized\n",
"- entity_type: Type of the entity\n",
"- entity_description: Comprehensive description of the entity's attributes and activities\n",
"Format each entity as (\"entity\"$$$$\"<entity_name>\"$$$$\"<entity_type>\"$$$$\"<entity_description>\")\n",
"\n",
"2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.\n",
"For each pair of related entities, extract the following information:\n",
Expand All @@ -1005,9 +1004,45 @@
"- relation: relationship between source_entity and target_entity\n",
"- relationship_description: explanation as to why you think the source entity and the target entity are related to each other\n",
"\n",
"Format each relationship as (\"relationship\"$$$$\"<source_entity>\"$$$$\"<target_entity>\"$$$$\"<relation>\"$$$$\"<relationship_description>\")\n",
"\n",
"3. When finished, output.\n",
"3. Output Formatting:\n",
"- Return the result in valid JSON format with two keys: 'entities' (list of entity objects) and 'relationships' (list of relationship objects).\n",
"- Exclude any text outside the JSON structure (e.g., no explanations or comments).\n",
"- If no entities or relationships are identified, return empty lists: { \"entities\": [], \"relationships\": [] }.\n",
"\n",
"-An Output Example-\n",
"{\n",
" \"entities\": [\n",
" {\n",
" \"entity_name\": \"Albert Einstein\",\n",
" \"entity_type\": \"Person\",\n",
" \"entity_description\": \"Albert Einstein was a theoretical physicist who developed the theory of relativity and made significant contributions to physics.\"\n",
" },\n",
" {\n",
" \"entity_name\": \"Theory of Relativity\",\n",
" \"entity_type\": \"Scientific Theory\",\n",
" \"entity_description\": \"A scientific theory developed by Albert Einstein, describing the laws of physics in relation to observers in different frames of reference.\"\n",
" },\n",
" {\n",
" \"entity_name\": \"Nobel Prize in Physics\",\n",
" \"entity_type\": \"Award\",\n",
" \"entity_description\": \"A prestigious international award in the field of physics, awarded annually by the Royal Swedish Academy of Sciences.\"\n",
" }\n",
" ],\n",
" \"relationships\": [\n",
" {\n",
" \"source_entity\": \"Albert Einstein\",\n",
" \"target_entity\": \"Theory of Relativity\",\n",
" \"relation\": \"developed\",\n",
" \"relationship_description\": \"Albert Einstein is the developer of the theory of relativity.\"\n",
" },\n",
" {\n",
" \"source_entity\": \"Albert Einstein\",\n",
" \"target_entity\": \"Nobel Prize in Physics\",\n",
" \"relation\": \"won\",\n",
" \"relationship_description\": \"Albert Einstein won the Nobel Prize in Physics in 1921.\"\n",
" }\n",
" ]\n",
"}\n",
"\n",
"-Real Data-\n",
"######################\n",
Expand All @@ -1022,14 +1057,40 @@
"metadata": {},
"outputs": [],
"source": [
"entity_pattern = r'\\(\"entity\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\)'\n",
"relationship_pattern = r'\\(\"relationship\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\)'\n",
"import json\n",
"\n",
"\n",
"def parse_fn(response_str: str) -> Any:\n",
" entities = re.findall(entity_pattern, response_str)\n",
" relationships = re.findall(relationship_pattern, response_str)\n",
" return entities, relationships\n",
" json_pattern = r\"\\{.*\\}\"\n",
" match = re.search(json_pattern, response_str, re.DOTALL)\n",
" entities = []\n",
" relationships = []\n",
" if not match:\n",
" return entities, relationships\n",
" json_str = match.group(0)\n",
" try:\n",
" data = json.loads(json_str)\n",
" entities = [\n",
" (\n",
" entity[\"entity_name\"],\n",
" entity[\"entity_type\"],\n",
" entity[\"entity_description\"],\n",
" )\n",
" for entity in data.get(\"entities\", [])\n",
" ]\n",
" relationships = [\n",
" (\n",
" relation[\"source_entity\"],\n",
" relation[\"target_entity\"],\n",
" relation[\"relation\"],\n",
" relation[\"relationship_description\"],\n",
" )\n",
" for relation in data.get(\"relationships\", [])\n",
" ]\n",
" return entities, relationships\n",
" except json.JSONDecodeError as e:\n",
" print(\"Error parsing JSON:\", e)\n",
" return entities, relationships\n",
"\n",
"\n",
"kg_extractor = GraphRAGExtractor(\n",
Expand Down
79 changes: 70 additions & 9 deletions docs/docs/examples/cookbooks/GraphRAG_v2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -778,7 +778,6 @@
"- entity_name: Name of the entity, capitalized\n",
"- entity_type: Type of the entity\n",
"- entity_description: Comprehensive description of the entity's attributes and activities\n",
"Format each entity as (\"entity\"$$$$\"<entity_name>\"$$$$\"<entity_type>\"$$$$\"<entity_description>\")\n",
"\n",
"2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.\n",
"For each pair of related entities, extract the following information:\n",
Expand All @@ -787,9 +786,45 @@
"- relation: relationship between source_entity and target_entity\n",
"- relationship_description: explanation as to why you think the source entity and the target entity are related to each other\n",
"\n",
"Format each relationship as (\"relationship\"$$$$\"<source_entity>\"$$$$\"<target_entity>\"$$$$\"<relation>\"$$$$\"<relationship_description>\")\n",
"\n",
"3. When finished, output.\n",
"3. Output Formatting:\n",
"- Return the result in valid JSON format with two keys: 'entities' (list of entity objects) and 'relationships' (list of relationship objects).\n",
"- Exclude any text outside the JSON structure (e.g., no explanations or comments).\n",
"- If no entities or relationships are identified, return empty lists: { \"entities\": [], \"relationships\": [] }.\n",
"\n",
"-An Output Example-\n",
"{\n",
" \"entities\": [\n",
" {\n",
" \"entity_name\": \"Albert Einstein\",\n",
" \"entity_type\": \"Person\",\n",
" \"entity_description\": \"Albert Einstein was a theoretical physicist who developed the theory of relativity and made significant contributions to physics.\"\n",
" },\n",
" {\n",
" \"entity_name\": \"Theory of Relativity\",\n",
" \"entity_type\": \"Scientific Theory\",\n",
" \"entity_description\": \"A scientific theory developed by Albert Einstein, describing the laws of physics in relation to observers in different frames of reference.\"\n",
" },\n",
" {\n",
" \"entity_name\": \"Nobel Prize in Physics\",\n",
" \"entity_type\": \"Award\",\n",
" \"entity_description\": \"A prestigious international award in the field of physics, awarded annually by the Royal Swedish Academy of Sciences.\"\n",
" }\n",
" ],\n",
" \"relationships\": [\n",
" {\n",
" \"source_entity\": \"Albert Einstein\",\n",
" \"target_entity\": \"Theory of Relativity\",\n",
" \"relation\": \"developed\",\n",
" \"relationship_description\": \"Albert Einstein is the developer of the theory of relativity.\"\n",
" },\n",
" {\n",
" \"source_entity\": \"Albert Einstein\",\n",
" \"target_entity\": \"Nobel Prize in Physics\",\n",
" \"relation\": \"won\",\n",
" \"relationship_description\": \"Albert Einstein won the Nobel Prize in Physics in 1921.\"\n",
" }\n",
" ]\n",
"}\n",
"\n",
"-Real Data-\n",
"######################\n",
Expand All @@ -804,14 +839,40 @@
"metadata": {},
"outputs": [],
"source": [
"entity_pattern = r'\\(\"entity\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\)'\n",
"relationship_pattern = r'\\(\"relationship\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\$\\$\\$\\$\"(.+?)\"\\)'\n",
"import json\n",
"\n",
"\n",
"def parse_fn(response_str: str) -> Any:\n",
" entities = re.findall(entity_pattern, response_str)\n",
" relationships = re.findall(relationship_pattern, response_str)\n",
" return entities, relationships\n",
" json_pattern = r\"\\{.*\\}\"\n",
" match = re.search(json_pattern, response_str, re.DOTALL)\n",
" entities = []\n",
" relationships = []\n",
" if not match:\n",
" return entities, relationships\n",
" json_str = match.group(0)\n",
" try:\n",
" data = json.loads(json_str)\n",
" entities = [\n",
" (\n",
" entity[\"entity_name\"],\n",
" entity[\"entity_type\"],\n",
" entity[\"entity_description\"],\n",
" )\n",
" for entity in data.get(\"entities\", [])\n",
" ]\n",
" relationships = [\n",
" (\n",
" relation[\"source_entity\"],\n",
" relation[\"target_entity\"],\n",
" relation[\"relation\"],\n",
" relation[\"relationship_description\"],\n",
" )\n",
" for relation in data.get(\"relationships\", [])\n",
" ]\n",
" return entities, relationships\n",
" except json.JSONDecodeError as e:\n",
" print(\"Error parsing JSON:\", e)\n",
" return entities, relationships\n",
"\n",
"\n",
"kg_extractor = GraphRAGExtractor(\n",
Expand Down