From d69299b21e60af018864a4ad8de632af2fee2620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Tue, 26 Mar 2024 23:18:01 +0100 Subject: [PATCH 01/12] add initial neo4j implementation --- .../components/graph_store/__init__.py | 0 .../graph_store/graph_store_component.py | 77 +++++++++++++++++++ private_gpt/settings/settings.py | 25 ++++++ pyproject.toml | 1 + settings-ollama.yaml | 9 +++ 5 files changed, 112 insertions(+) create mode 100644 private_gpt/components/graph_store/__init__.py create mode 100644 private_gpt/components/graph_store/graph_store_component.py diff --git a/private_gpt/components/graph_store/__init__.py b/private_gpt/components/graph_store/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/private_gpt/components/graph_store/graph_store_component.py b/private_gpt/components/graph_store/graph_store_component.py new file mode 100644 index 000000000..20543d92e --- /dev/null +++ b/private_gpt/components/graph_store/graph_store_component.py @@ -0,0 +1,77 @@ +import logging +import typing + +from injector import inject, singleton +from llama_index.core.graph_stores.types import ( + GraphStore, +) +from llama_index.core.indices.knowledge_graph import ( + KnowledgeGraphRAGRetriever, +) +from llama_index.core.llms.llm import LLM +from llama_index.core.storage import StorageContext + +from private_gpt.settings.settings import Settings + +logger = logging.getLogger(__name__) + + +@singleton +class GraphStoreComponent: + settings: Settings + graph_store: GraphStore + + @inject + def __init__(self, settings: Settings) -> None: + self.settings = settings + + # If no graphstore is defined, return, making the graphstore optional + if settings.graphstore is None: + return + + match settings.graphstore.database: + case "neo4j": + try: + from llama_index.graph_stores.neo4j import ( # type: ignore + Neo4jGraphStore, + ) + except ImportError as e: + raise ImportError( + "Neo4j dependencies not found, 
install with `poetry install --extras graph-stores-neo4j`" + ) from e + + if settings.neo4j is None: + raise ValueError( + "Neo4j settings not found. Please provide settings." + ) + + self.graph_store = typing.cast( + GraphStore, + Neo4jGraphStore( + **settings.neo4j.model_dump(exclude_none=True), + ), # TODO + ) + case _: + # Should be unreachable + # The settings validator should have caught this + raise ValueError( + f"Vectorstore database {settings.vectorstore.database} not supported" + ) + + def get_knowledge_graph( + self, + llm: LLM, + ) -> KnowledgeGraphRAGRetriever: + if self.graph_store is None: + raise ValueError("GraphStore not defined in settings") + + storage_context = StorageContext.from_defaults(graph_store=self.graph_store) + return KnowledgeGraphRAGRetriever( + storage_context=storage_context, + llm=llm, + verbose=True, + ) + + def close(self) -> None: + if hasattr(self.graph_store.client, "close"): + self.graph_store.client.close() diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py index 5896f00d6..4211634c7 100644 --- a/private_gpt/settings/settings.py +++ b/private_gpt/settings/settings.py @@ -114,6 +114,10 @@ class NodeStoreSettings(BaseModel): database: Literal["simple", "postgres"] +class GraphStoreSettings(BaseModel): + database: Literal["neo4j"] + + class LlamaCPPSettings(BaseModel): llm_hf_repo_id: str llm_hf_model_file: str @@ -376,6 +380,25 @@ class QdrantSettings(BaseModel): ) +class Neo4jSettings(BaseModel): + url: str | None = Field( + "bolt://localhost:7687", + description="URL of the Neo4j database.", + ) + username: str | None = Field( + "neo4j", + description="Username to connect to the Neo4j database.", + ) + password: str | None = Field( + "password", + description="Password to connect to the Neo4j database.", + ) + database: str | None = Field( + "neo4j", + description="Database name to connect to the Neo4j database.", + ) + + class Settings(BaseModel): server: ServerSettings data: DataSettings @@ 
-389,10 +412,12 @@ class Settings(BaseModel): ollama: OllamaSettings azopenai: AzureOpenAISettings vectorstore: VectorstoreSettings + graphstore: GraphStoreSettings nodestore: NodeStoreSettings rag: RagSettings qdrant: QdrantSettings | None = None postgres: PostgresSettings | None = None + neo4j: Neo4jSettings | None = None """ diff --git a/pyproject.toml b/pyproject.toml index d56899987..342193dd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ vector-stores-qdrant = ["llama-index-vector-stores-qdrant"] vector-stores-chroma = ["llama-index-vector-stores-chroma"] vector-stores-postgres = ["llama-index-vector-stores-postgres"] storage-nodestore-postgres = ["llama-index-storage-docstore-postgres","llama-index-storage-index-store-postgres","psycopg2-binary","asyncpg"] +graph-stores-neo4j = ["llama-index-graph-stores-neo4j"] [tool.poetry.group.dev.dependencies] black = "^22" diff --git a/settings-ollama.yaml b/settings-ollama.yaml index d7e1a12ca..3037a69b7 100644 --- a/settings-ollama.yaml +++ b/settings-ollama.yaml @@ -24,5 +24,14 @@ ollama: vectorstore: database: qdrant +graphstore: + database: neo4j + qdrant: path: local_data/private_gpt/qdrant + +neo4j: + url: neo4j://localhost:7687 + username: neo4j + password: password + database: neo4j From ace1466b244180be04e0e8220f5a5a4f126121cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Tue, 26 Mar 2024 23:19:56 +0100 Subject: [PATCH 02/12] add initial rdf reader --- .../components/ingest/ingest_helper.py | 9 ++- .../components/ingest/readers/__init__.py | 0 .../components/ingest/readers/rdfreader.py | 77 +++++++++++++++++++ 3 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 private_gpt/components/ingest/readers/__init__.py create mode 100644 private_gpt/components/ingest/readers/rdfreader.py diff --git a/private_gpt/components/ingest/ingest_helper.py b/private_gpt/components/ingest/ingest_helper.py index a11090702..46666d439 100644 --- 
a/private_gpt/components/ingest/ingest_helper.py +++ b/private_gpt/components/ingest/ingest_helper.py @@ -27,6 +27,10 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]: from llama_index.readers.file.video_audio import ( # type: ignore VideoAudioReader, ) + + from private_gpt.components.ingest.readers.rdfreader import ( # type: ignore + RDFReader, + ) except ImportError as e: raise ImportError("`llama-index-readers-file` package not found") from e @@ -48,7 +52,10 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]: ".mbox": MboxReader, ".ipynb": IPYNBReader, } - return default_file_reader_cls + optional_file_reader_cls: dict[str, type[BaseReader]] = { + ".ttl": RDFReader, + } + return {**default_file_reader_cls, **optional_file_reader_cls} # Patching the default file reader to support other file types diff --git a/private_gpt/components/ingest/readers/__init__.py b/private_gpt/components/ingest/readers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/private_gpt/components/ingest/readers/rdfreader.py b/private_gpt/components/ingest/readers/rdfreader.py new file mode 100644 index 000000000..8096b3975 --- /dev/null +++ b/private_gpt/components/ingest/readers/rdfreader.py @@ -0,0 +1,77 @@ +# mypy: ignore-errors + +"""Read RDF files. + +This module is used to read RDF files. +It was created by llama-hub but it has not been ported +to llama-index==0.1.0 with multiples changes to fix the code. 
+ +Original code: +https://github.com/run-llama/llama-hub +""" + +from pathlib import Path +from typing import Any + +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import Document +from rdflib import Graph, URIRef +from rdflib.namespace import RDF, RDFS + + +class RDFReader(BaseReader): + """RDF reader.""" + + def __init__( + self, + *args: Any, + **kwargs: Any, + ) -> None: + """Initialize loader.""" + super().__init__(*args, **kwargs) + + def fetch_labels(self, uri: URIRef, graph: Graph, lang: str): + """Fetch all labels of a URI by language.""" + return list( + filter(lambda x: x.language in [lang, None], graph.objects(uri, RDFS.label)) + ) + + def fetch_label_in_graphs(self, uri: URIRef, lang: str = "en"): + """Fetch one label of a URI by language from the local or global graph.""" + labels = self.fetch_labels(uri, self.g_local, lang) + if len(labels) > 0: + return labels[0].value + + labels = self.fetch_labels(uri, self.g_global, lang) + if len(labels) > 0: + return labels[0].value + + raise Exception(f"Label not found for: {uri}") + + def load_data(self, file: Path, extra_info: dict | None = None) -> list[Document]: + """Parse file.""" + lang = extra_info["lang"] if extra_info is not None else "en" + + self.g_local = Graph() + self.g_local.parse(file) + + self.g_global = Graph() + self.g_global.parse(str(RDF)) + self.g_global.parse(str(RDFS)) + + text_list = [] + + for s, p, o in self.g_local: + if p == RDFS.label: + continue + print(s, p, o) + triple = ( + f"<{self.fetch_label_in_graphs(s, lang=lang)}> " + f"<{self.fetch_label_in_graphs(p, lang=lang)}> " + f"<{self.fetch_label_in_graphs(o, lang=lang)}>" + ) + text_list.append(triple) + + text = "\n".join(text_list) + + return [Document(text, extra_info=extra_info)] From d7908bcefa6bd7a6b005e98e3c19ec68d3dc12d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Wed, 27 Mar 2024 00:18:06 +0100 Subject: [PATCH 03/12] fix rdf reader and 
initial tests --- .../components/ingest/readers/rdfreader.py | 21 +- tests/server/ingest/test.ttl | 358 ++++++++++++++++++ tests/server/ingest/test_ingest_routes.py | 6 + 3 files changed, 377 insertions(+), 8 deletions(-) create mode 100644 tests/server/ingest/test.ttl diff --git a/private_gpt/components/ingest/readers/rdfreader.py b/private_gpt/components/ingest/readers/rdfreader.py index 8096b3975..3bc3ac990 100644 --- a/private_gpt/components/ingest/readers/rdfreader.py +++ b/private_gpt/components/ingest/readers/rdfreader.py @@ -46,7 +46,7 @@ def fetch_label_in_graphs(self, uri: URIRef, lang: str = "en"): if len(labels) > 0: return labels[0].value - raise Exception(f"Label not found for: {uri}") + return None # Return None if label not found def load_data(self, file: Path, extra_info: dict | None = None) -> list[Document]: """Parse file.""" @@ -64,14 +64,19 @@ def load_data(self, file: Path, extra_info: dict | None = None) -> list[Document for s, p, o in self.g_local: if p == RDFS.label: continue - print(s, p, o) - triple = ( - f"<{self.fetch_label_in_graphs(s, lang=lang)}> " - f"<{self.fetch_label_in_graphs(p, lang=lang)}> " - f"<{self.fetch_label_in_graphs(o, lang=lang)}>" - ) + + subj_label = self.fetch_label_in_graphs(s, lang=lang) + pred_label = self.fetch_label_in_graphs(p, lang=lang) + obj_label = self.fetch_label_in_graphs(o, lang=lang) + + if subj_label is None or pred_label is None or obj_label is None: + continue + + triple = f"<{subj_label}> " f"<{pred_label}> " f"<{obj_label}>" text_list.append(triple) text = "\n".join(text_list) + return [self._text_to_document(text, extra_info)] - return [Document(text, extra_info=extra_info)] + def _text_to_document(self, text: str, extra_info: dict | None = None) -> Document: + return Document(text=text, extra_info=extra_info or {}) diff --git a/tests/server/ingest/test.ttl b/tests/server/ingest/test.ttl new file mode 100644 index 000000000..99a743388 --- /dev/null +++ b/tests/server/ingest/test.ttl @@ -0,0 
+1,358 @@ +@prefix ns1: . +@prefix rdfs: . + +ns1:Q1044339 rdfs:label "Valeriano Balloni" ; + ns1:hasTeam ns1:Q13385, + ns1:Q289029, + ns1:Q297430, + ns1:Q650365, + ns1:Q6767 . + +ns1:Q110992321 rdfs:label "Tim Karius" ; + ns1:hasTeam ns1:Q1387210, + ns1:Q655591 . + +ns1:Q12402730 rdfs:label "Xoaquín Álvarez Corbacho" ; + ns1:hasTeam ns1:Q8749 . + +ns1:Q12813965 rdfs:label "József Cserháti" ; + ns1:hasTeam ns1:Q732885 . + +ns1:Q13101502 rdfs:label "Alphonse Weicker" ; + ns1:hasTeam ns1:Q184266, + ns1:Q693092 . + +ns1:Q1497593 rdfs:label "Gediminas Budnikas" ; + ns1:hasTeam ns1:Q393357 . + +ns1:Q1531063 rdfs:label "Glenn W. Harrison" ; + ns1:hasTeam ns1:Q1034556 . + +ns1:Q16081110 rdfs:label "Moon Hyung-pyo" ; + ns1:hasTeam ns1:Q39988 . + +ns1:Q16091117 rdfs:label "Lee Ju-yeol" ; + ns1:hasTeam ns1:Q39988 . + +ns1:Q16299411 rdfs:label "Juha Joenväärä" ; + ns1:hasTeam ns1:Q1130636, + ns1:Q1232297 . + +ns1:Q1686485 rdfs:label "Jeff Immelt" ; + ns1:hasTeam ns1:Q5225674 . + +ns1:Q16942062 rdfs:label "Tito Montaño" ; + ns1:hasTeam ns1:Q127925 . + +ns1:Q1776728 rdfs:label "Svein Gjedrem" ; + ns1:hasTeam ns1:Q737937 . + +ns1:Q17917747 rdfs:label "Noel Newton Nethersole" ; + ns1:hasTeam ns1:Q3590248 . + +ns1:Q18541191 rdfs:label "Adalbert Kassai" ; + ns1:hasTeam ns1:Q1135735, + ns1:Q1195647, + ns1:Q1386940, + ns1:Q1689705, + ns1:Q841245, + ns1:Q842134 . + +ns1:Q18562973 rdfs:label "István Hagelmayer" ; + ns1:hasTeam ns1:Q606773 . + +ns1:Q192533 rdfs:label "Mark Carney" ; + ns1:hasTeam ns1:Q5676342 . + +ns1:Q1930105 rdfs:label "Michaela Vosbeck" ; + ns1:hasTeam ns1:Q1715018, + ns1:Q1792079, + ns1:Q2931573, + ns1:Q300032 . + +ns1:Q202693 rdfs:label "Jo Nesbø" ; + ns1:hasTeam ns1:Q208552 . + +ns1:Q2055385 rdfs:label "Alexandre Baptista" ; + ns1:hasTeam ns1:Q267245, + ns1:Q75729 . + +ns1:Q22003558 rdfs:label "Colin Cannonier" ; + ns1:hasTeam ns1:Q3590581 . + +ns1:Q2535499 rdfs:label "Tadao Horie" ; + ns1:hasTeam ns1:Q170566 . 
+ +ns1:Q27491470 rdfs:label "Telesfor Banaszkiewicz" ; + ns1:hasTeam ns1:Q11821053, + ns1:Q1198772 . + +ns1:Q30308976 rdfs:label "Thomas Howden Fraser" ; + ns1:hasTeam ns1:Q117467 . + +ns1:Q311025 rdfs:label "Henry Paulson" ; + ns1:hasTeam ns1:Q5225674 . + +ns1:Q3132658 rdfs:label "Henry Braddon" ; + ns1:hasTeam ns1:Q55801 . + +ns1:Q313682 rdfs:label "Oleguer Presas" ; + ns1:hasTeam ns1:Q10467, + ns1:Q17228, + ns1:Q2220788, + ns1:Q7156, + ns1:Q81888 . + +ns1:Q3470333 rdfs:label "Salvador Servià i Costa" ; + ns1:hasTeam ns1:Q188217, + ns1:Q35896 . + +ns1:Q354317 rdfs:label "Vebjørn Rodal" ; + ns1:hasTeam ns1:Q11993950 . + +ns1:Q3592042 rdfs:label "Étienne Antonelli" ; + ns1:hasTeam ns1:Q132885 . + +ns1:Q3808555 rdfs:label "Joan Trayter" ; + ns1:hasTeam ns1:Q3091261 . + +ns1:Q4011129 rdfs:label "Vicente Locaso" ; + ns1:hasTeam ns1:Q15799, + ns1:Q18640, + ns1:Q327172, + ns1:Q79800 . + +ns1:Q457755 rdfs:label "Alfred Lawson" ; + ns1:hasTeam ns1:Q461595, + ns1:Q653772 . + +ns1:Q4908745 rdfs:label "Bill Demory" ; + ns1:hasTeam ns1:Q219602, + ns1:Q4791461 . + +ns1:Q4939229 rdfs:label "Bolesław Banaś" ; + ns1:hasTeam ns1:Q3593958 . + +ns1:Q4961008 rdfs:label "Brendan Menton, Sr." ; + ns1:hasTeam ns1:Q629300 . + +ns1:Q4968933 rdfs:label "Rune Gerhardsen" ; + ns1:hasTeam ns1:Q2042878 . + +ns1:Q5405396 rdfs:label "Alejandro Brand" ; + ns1:hasTeam ns1:Q212564, + ns1:Q391984 . + +ns1:Q559712 rdfs:label "Magomedsalam Magomedov" ; + ns1:hasTeam ns1:Q2494171 . + +ns1:Q60735037 rdfs:label "Peter Morgan" ; + ns1:hasTeam ns1:Q18516 . + +ns1:Q6148645 rdfs:label "Tomás Soley Güell" ; + ns1:hasTeam ns1:Q7156 . + +ns1:Q65624037 rdfs:label "Thomas Staub" ; + ns1:hasTeam ns1:Q201969 . + +ns1:Q6708659 rdfs:label "Lyndhurst Falkiner Giblin" ; + ns1:hasTeam ns1:Q378628 . + +ns1:Q7172847 rdfs:label "Peter Henry" ; + ns1:hasTeam ns1:Q7054630 . + +ns1:Q7193582 rdfs:label "Pike Curtin" ; + ns1:hasTeam ns1:Q3589750 . + +ns1:Q732476 rdfs:label "Xavier Sala-i-Martin" ; + ns1:hasTeam ns1:Q3091261 . 
+ +ns1:Q7436183 rdfs:label "Scott Cowen" ; + ns1:hasTeam ns1:Q16959086 . + +ns1:Q75748 rdfs:label "Hans Tietmeyer" ; + ns1:hasTeam ns1:Q2385504 . + +ns1:Q769073 rdfs:label "W. Morrissey" ; + ns1:hasTeam ns1:Q2367373 . + +ns1:Q84218605 rdfs:label "José María Echevarría Arteche" ; + ns1:hasTeam ns1:Q1103198 . + +ns1:Q8667562 rdfs:label "Valerijonas Balčiūnas" ; + ns1:hasTeam ns1:Q186276 . + +ns1:Q89141301 rdfs:label "Anna Potok" ; + ns1:hasTeam ns1:Q4841 . + +ns1:Q9199508 rdfs:label "Czesława Pilarska" ; + ns1:hasTeam ns1:Q11733016 . + +ns1:Q947814 rdfs:label "Steinar Hoen" ; + ns1:hasTeam ns1:Q4573629 . + +ns1:Q963421 rdfs:label "Carl-Henric Svanberg" ; + ns1:hasTeam ns1:Q1653574 . + +ns1:Q98072140 rdfs:label "Q98072140" ; + ns1:hasTeam ns1:Q28214543 . + +ns1:Q1034556 rdfs:label "Hawthorn Football Club" . + +ns1:Q10467 rdfs:label "FC Barcelona Atlètic" . + +ns1:Q1103198 rdfs:label "Club de Campo Villa de Madrid" . + +ns1:Q1130636 rdfs:label "Oulun Kärpät" . + +ns1:Q1135735 rdfs:label "CS Corvinul Hunedoara" . + +ns1:Q11733016 rdfs:label "Stilon Gorzów Wielkopolski" . + +ns1:Q117467 rdfs:label "Royal Society of Edinburgh" . + +ns1:Q11821053 rdfs:label "Q11821053" . + +ns1:Q1195647 rdfs:label "FC Progresul București" . + +ns1:Q1198772 rdfs:label "Warta Poznań" . + +ns1:Q11993950 rdfs:label "Oppdal IL" . + +ns1:Q1232297 rdfs:label "Djurgårdens IF Hockey" . + +ns1:Q127925 rdfs:label "Club Aurora" . + +ns1:Q132885 rdfs:label "Olympique de Marseille" . + +ns1:Q13385 rdfs:label "Società Polisportiva Ars et Labor" . + +ns1:Q1386940 rdfs:label "FC Bihor Oradea" . + +ns1:Q1387210 rdfs:label "FC Jeunesse Canach" . + +ns1:Q15799 rdfs:label "Club Atlético River Plate" . + +ns1:Q1653574 rdfs:label "IF Björklöven" . + +ns1:Q1689705 rdfs:label "FC Jiul Petroșani" . + +ns1:Q16959086 rdfs:label "UConn Huskies football" . + +ns1:Q170566 rdfs:label "Japan national football team" . + +ns1:Q1715018 rdfs:label "TV Hörde" . + +ns1:Q17228 rdfs:label "Catalonia national football team" . 
+ +ns1:Q1792079 rdfs:label "VC Schwerte" . + +ns1:Q184266 rdfs:label "Luxembourg national football team" . + +ns1:Q18516 rdfs:label "Hereford United F.C." . + +ns1:Q186276 rdfs:label "Lithuania national football team" . + +ns1:Q18640 rdfs:label "Gimnasia y Esgrima La Plata" . + +ns1:Q188217 rdfs:label "SEAT" . + +ns1:Q201969 rdfs:label "FC Winterthur" . + +ns1:Q2042878 rdfs:label "Aktiv SK" . + +ns1:Q208552 rdfs:label "Molde FK" . + +ns1:Q212564 rdfs:label "Colombia national football team" . + +ns1:Q219602 rdfs:label "New York Jets" . + +ns1:Q2220788 rdfs:label "UDA Gramenet" . + +ns1:Q2367373 rdfs:label "NYU Violets" . + +ns1:Q2385504 rdfs:label "Q2385504" . + +ns1:Q2494171 rdfs:label "FC Dynamo Makhachkala" . + +ns1:Q267245 rdfs:label "Portugal national association football team" . + +ns1:Q28214543 rdfs:label "Trabzonspor" . + +ns1:Q289029 rdfs:label "U.S. Ancona" . + +ns1:Q2931573 rdfs:label "CJD Feuerbach" . + +ns1:Q297430 rdfs:label "S.S. Arezzo" . + +ns1:Q300032 rdfs:label "Germany women's national volleyball team" . + +ns1:Q327172 rdfs:label "Club Atlético Huracán" . + +ns1:Q35896 rdfs:label "Lancia" . + +ns1:Q3589750 rdfs:label "Western Australia cricket team" . + +ns1:Q3590248 rdfs:label "Jamaica national cricket team" . + +ns1:Q3590581 rdfs:label "Leeward Islands cricket team" . + +ns1:Q3593958 rdfs:label "ŁKS Łódź" . + +ns1:Q378628 rdfs:label "England national rugby union team" . + +ns1:Q391984 rdfs:label "Millonarios" . + +ns1:Q393357 rdfs:label "BC Žalgiris" . + +ns1:Q4573629 rdfs:label "IK Tjalve" . + +ns1:Q461595 rdfs:label "Atlanta Braves" . + +ns1:Q4791461 rdfs:label "Arizona Wildcats football" . + +ns1:Q4841 rdfs:label "Lech Poznań" . + +ns1:Q55801 rdfs:label "New Zealand national rugby union team" . + +ns1:Q5676342 rdfs:label "Harvard Crimson men's ice hockey" . + +ns1:Q606773 rdfs:label "Dorogi FC" . + +ns1:Q629300 rdfs:label "Home Farm F.C." . + +ns1:Q650365 rdfs:label "Carrarese Calcio" . + +ns1:Q653772 rdfs:label "Pittsburgh Pirates" . 
+ +ns1:Q655591 rdfs:label "FC Koeppchen Wormeldange" . + +ns1:Q6767 rdfs:label "U.S. Livorno 1915" . + +ns1:Q693092 rdfs:label "Racing FC Union Luxembourg" . + +ns1:Q7054630 rdfs:label "North Carolina Tar Heels football" . + +ns1:Q732885 rdfs:label "Salgótarjáni BTC" . + +ns1:Q737937 rdfs:label "Lyn 1896 FK" . + +ns1:Q75729 rdfs:label "Sporting CP" . + +ns1:Q79800 rdfs:label "Argentina national association football team" . + +ns1:Q81888 rdfs:label "AFC Ajax" . + +ns1:Q841245 rdfs:label "FC Argeș" . + +ns1:Q842134 rdfs:label "FC Sportul Studențesc București" . + +ns1:Q8749 rdfs:label "RC Celta de Vigo" . + +ns1:Q3091261 rdfs:label "FC Barcelona" . + +ns1:Q39988 rdfs:label "Yonsei University" . + +ns1:Q5225674 rdfs:label "Dartmouth Big Green football" . + +ns1:Q7156 rdfs:label "FC Barcelona" . diff --git a/tests/server/ingest/test_ingest_routes.py b/tests/server/ingest/test_ingest_routes.py index 896410a17..3bdba834c 100644 --- a/tests/server/ingest/test_ingest_routes.py +++ b/tests/server/ingest/test_ingest_routes.py @@ -19,6 +19,12 @@ def test_ingest_accepts_pdf_files(ingest_helper: IngestHelper) -> None: assert len(ingest_result.data) == 1 +def test_ingest_accepts_ttf_files(ingest_helper: IngestHelper) -> None: + path = Path(__file__).parents[0] / "test.ttl" + ingest_result = ingest_helper.ingest_file(path) + assert len(ingest_result.data) == 1 + + def test_ingest_list_returns_something_after_ingestion( test_client: TestClient, ingest_helper: IngestHelper ) -> None: From 93db41483a3a46b849218945b2114326e02f69f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Wed, 27 Mar 2024 18:26:20 +0100 Subject: [PATCH 04/12] more fixes --- private_gpt/components/ingest/readers/rdfreader.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/private_gpt/components/ingest/readers/rdfreader.py b/private_gpt/components/ingest/readers/rdfreader.py index 3bc3ac990..eda34d3a4 100644 --- 
a/private_gpt/components/ingest/readers/rdfreader.py +++ b/private_gpt/components/ingest/readers/rdfreader.py @@ -10,6 +10,7 @@ https://github.com/run-llama/llama-hub """ +import logging from pathlib import Path from typing import Any @@ -18,6 +19,8 @@ from rdflib import Graph, URIRef from rdflib.namespace import RDF, RDFS +logger = logging.getLogger(__name__) + class RDFReader(BaseReader): """RDF reader.""" @@ -46,11 +49,17 @@ def fetch_label_in_graphs(self, uri: URIRef, lang: str = "en"): if len(labels) > 0: return labels[0].value - return None # Return None if label not found + return str(uri) def load_data(self, file: Path, extra_info: dict | None = None) -> list[Document]: """Parse file.""" - lang = extra_info["lang"] if extra_info is not None else "en" + extra_info = extra_info or {} + extra_info["graph_type"] = "rdf" + lang = ( + extra_info["lang"] + if extra_info is not None and "lang" in extra_info + else "en" + ) self.g_local = Graph() self.g_local.parse(file) @@ -62,6 +71,7 @@ def load_data(self, file: Path, extra_info: dict | None = None) -> list[Document text_list = [] for s, p, o in self.g_local: + logger.debug("s=%s, p=%s, o=%s", s, p, o) if p == RDFS.label: continue From b4b4cbbc3a2f227674e66741defb35588b4ec199 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Wed, 27 Mar 2024 19:11:30 +0100 Subject: [PATCH 05/12] add graph store to chat, chunks and ingest services --- .../components/graph_store/graph_store_component.py | 6 +++--- private_gpt/server/chat/chat_service.py | 7 +++++++ private_gpt/server/chunks/chunks_service.py | 5 +++++ private_gpt/server/ingest/ingest_service.py | 5 +++++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/private_gpt/components/graph_store/graph_store_component.py b/private_gpt/components/graph_store/graph_store_component.py index 20543d92e..2c646d8d9 100644 --- a/private_gpt/components/graph_store/graph_store_component.py +++ 
b/private_gpt/components/graph_store/graph_store_component.py @@ -19,7 +19,7 @@ @singleton class GraphStoreComponent: settings: Settings - graph_store: GraphStore + graph_store: GraphStore | None = None @inject def __init__(self, settings: Settings) -> None: @@ -60,12 +60,12 @@ def __init__(self, settings: Settings) -> None: def get_knowledge_graph( self, + storage_context: StorageContext, llm: LLM, ) -> KnowledgeGraphRAGRetriever: if self.graph_store is None: raise ValueError("GraphStore not defined in settings") - storage_context = StorageContext.from_defaults(graph_store=self.graph_store) return KnowledgeGraphRAGRetriever( storage_context=storage_context, llm=llm, @@ -73,5 +73,5 @@ def get_knowledge_graph( ) def close(self) -> None: - if hasattr(self.graph_store.client, "close"): + if self.graph_store and hasattr(self.graph_store.client, "close"): self.graph_store.client.close() diff --git a/private_gpt/server/chat/chat_service.py b/private_gpt/server/chat/chat_service.py index ea57f2c0d..a84b96b99 100644 --- a/private_gpt/server/chat/chat_service.py +++ b/private_gpt/server/chat/chat_service.py @@ -16,6 +16,7 @@ from pydantic import BaseModel from private_gpt.components.embedding.embedding_component import EmbeddingComponent +from private_gpt.components.graph_store.graph_store_component import GraphStoreComponent from private_gpt.components.llm.llm_component import LLMComponent from private_gpt.components.node_store.node_store_component import NodeStoreComponent from private_gpt.components.vector_store.vector_store_component import ( @@ -82,6 +83,7 @@ def __init__( vector_store_component: VectorStoreComponent, embedding_component: EmbeddingComponent, node_store_component: NodeStoreComponent, + graph_store_component: GraphStoreComponent, ) -> None: self.settings = settings self.llm_component = llm_component @@ -89,6 +91,9 @@ def __init__( self.vector_store_component = vector_store_component self.storage_context = StorageContext.from_defaults( 
vector_store=vector_store_component.vector_store, + graph_store=graph_store_component.graph_store + if graph_store_component and graph_store_component.graph_store + else None, docstore=node_store_component.doc_store, index_store=node_store_component.index_store, ) @@ -99,6 +104,8 @@ def __init__( embed_model=embedding_component.embedding_model, show_progress=True, ) + self.graph_store_component = graph_store_component + self.knowledge_graph_index = graph_store_component.graph_store def _chat_engine( self, diff --git a/private_gpt/server/chunks/chunks_service.py b/private_gpt/server/chunks/chunks_service.py index 7bda5d904..26a1e7a02 100644 --- a/private_gpt/server/chunks/chunks_service.py +++ b/private_gpt/server/chunks/chunks_service.py @@ -7,6 +7,7 @@ from pydantic import BaseModel, Field from private_gpt.components.embedding.embedding_component import EmbeddingComponent +from private_gpt.components.graph_store.graph_store_component import GraphStoreComponent from private_gpt.components.llm.llm_component import LLMComponent from private_gpt.components.node_store.node_store_component import NodeStoreComponent from private_gpt.components.vector_store.vector_store_component import ( @@ -60,6 +61,7 @@ def __init__( self, llm_component: LLMComponent, vector_store_component: VectorStoreComponent, + graph_store_component: GraphStoreComponent, embedding_component: EmbeddingComponent, node_store_component: NodeStoreComponent, ) -> None: @@ -68,6 +70,9 @@ def __init__( self.embedding_component = embedding_component self.storage_context = StorageContext.from_defaults( vector_store=vector_store_component.vector_store, + graph_store=graph_store_component.graph_store + if graph_store_component and graph_store_component.graph_store + else None, docstore=node_store_component.doc_store, index_store=node_store_component.index_store, ) diff --git a/private_gpt/server/ingest/ingest_service.py b/private_gpt/server/ingest/ingest_service.py index f9ae4728f..544708579 100644 --- 
a/private_gpt/server/ingest/ingest_service.py +++ b/private_gpt/server/ingest/ingest_service.py @@ -8,6 +8,7 @@ from llama_index.core.storage import StorageContext from private_gpt.components.embedding.embedding_component import EmbeddingComponent +from private_gpt.components.graph_store.graph_store_component import GraphStoreComponent from private_gpt.components.ingest.ingest_component import get_ingestion_component from private_gpt.components.llm.llm_component import LLMComponent from private_gpt.components.node_store.node_store_component import NodeStoreComponent @@ -30,12 +31,16 @@ def __init__( self, llm_component: LLMComponent, vector_store_component: VectorStoreComponent, + graph_store_component: GraphStoreComponent, embedding_component: EmbeddingComponent, node_store_component: NodeStoreComponent, ) -> None: self.llm_service = llm_component self.storage_context = StorageContext.from_defaults( vector_store=vector_store_component.vector_store, + graph_store=graph_store_component.graph_store + if graph_store_component and graph_store_component.graph_store + else None, docstore=node_store_component.doc_store, index_store=node_store_component.index_store, ) From d863fd4a755bc0e5fb99f114839e227f6cbd1272 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Wed, 27 Mar 2024 20:11:49 +0100 Subject: [PATCH 06/12] allow to save knowledge graph --- .../components/ingest/ingest_component.py | 47 +++++++++++++++++-- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/private_gpt/components/ingest/ingest_component.py b/private_gpt/components/ingest/ingest_component.py index 5ed039590..a10ba218e 100644 --- a/private_gpt/components/ingest/ingest_component.py +++ b/private_gpt/components/ingest/ingest_component.py @@ -9,11 +9,16 @@ from queue import Queue from typing import Any +from llama_index.core import KnowledgeGraphIndex from llama_index.core.data_structs import IndexDict from llama_index.core.embeddings.utils import 
EmbedType -from llama_index.core.indices import VectorStoreIndex, load_index_from_storage +from llama_index.core.indices import ( + VectorStoreIndex, + load_index_from_storage, +) from llama_index.core.indices.base import BaseIndex from llama_index.core.ingestion import run_transformations +from llama_index.core.llms.llm import LLM from llama_index.core.schema import BaseNode, Document, TransformComponent from llama_index.core.storage import StorageContext @@ -67,9 +72,13 @@ def __init__( self._index_thread_lock = ( threading.Lock() ) # Thread lock! Not Multiprocessing lock - self._index = self._initialize_index() + self._index = self._initialize_index(**kwargs) + self._knowledge_graph = self._initialize_knowledge_graph(**kwargs) - def _initialize_index(self) -> BaseIndex[IndexDict]: + def _initialize_index( + self, + llm: LLM, + ) -> BaseIndex[IndexDict]: """Initialize the index from the storage context.""" try: # Load the index with store_nodes_override=True to be able to delete them @@ -79,6 +88,7 @@ def _initialize_index(self) -> BaseIndex[IndexDict]: show_progress=self.show_progress, embed_model=self.embed_model, transformations=self.transformations, + llm=llm, ) except ValueError: # There are no index in the storage context, creating a new one @@ -94,9 +104,34 @@ def _initialize_index(self) -> BaseIndex[IndexDict]: index.storage_context.persist(persist_dir=local_data_path) return index + def _initialize_knowledge_graph( + self, + llm: LLM, + max_triplets_per_chunk: int = 10, + include_embeddings: bool = True, + ) -> KnowledgeGraphIndex: + """Initialize the index from the storage context.""" + index = KnowledgeGraphIndex.from_documents( + [], + storage_context=self.storage_context, + show_progress=self.show_progress, + embed_model=self.embed_model, + transformations=self.transformations, + llm=llm, + max_triplets_per_chunk=max_triplets_per_chunk, + include_embeddings=include_embeddings, + ) + index.storage_context.persist(persist_dir=local_data_path) + return 
index + def _save_index(self) -> None: + logger.debug("Persisting the index") self._index.storage_context.persist(persist_dir=local_data_path) + def _save_knowledge_graph(self) -> None: + logger.debug("Persisting the knowledge graph") + self._knowledge_graph.storage_context.persist(persist_dir=local_data_path) + def delete(self, doc_id: str) -> None: with self._index_thread_lock: # Delete the document from the index @@ -105,6 +140,12 @@ def delete(self, doc_id: str) -> None: # Save the index self._save_index() + # Delete the document from the knowledge graph + self._knowledge_graph.delete_ref_doc(doc_id, delete_from_docstore=True) + + # Save the knowledge graph + self._save_knowledge_graph() + class SimpleIngestComponent(BaseIngestComponentWithIndex): def __init__( From 8acf3db0e6ee9c5583f9f82e2a28cd4f1742d8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Wed, 27 Mar 2024 20:12:27 +0100 Subject: [PATCH 07/12] fix openai default --- private_gpt/components/ingest/ingest_component.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/private_gpt/components/ingest/ingest_component.py b/private_gpt/components/ingest/ingest_component.py index a10ba218e..393ab7fd1 100644 --- a/private_gpt/components/ingest/ingest_component.py +++ b/private_gpt/components/ingest/ingest_component.py @@ -526,6 +526,8 @@ def get_ingestion_component( embed_model: EmbedType, transformations: list[TransformComponent], settings: Settings, + *args: Any, + **kwargs: Any, ) -> BaseIngestComponent: """Get the ingestion component for the given configuration.""" ingest_mode = settings.embedding.ingest_mode @@ -535,6 +537,7 @@ def get_ingestion_component( embed_model=embed_model, transformations=transformations, count_workers=settings.embedding.count_workers, + llm=kwargs.get("llm"), ) elif ingest_mode == "parallel": return ParallelizedIngestComponent( @@ -542,6 +545,7 @@ def get_ingestion_component( embed_model=embed_model, 
transformations=transformations, count_workers=settings.embedding.count_workers, + llm=kwargs.get("llm"), ) elif ingest_mode == "pipeline": return PipelineIngestComponent( @@ -549,10 +553,12 @@ storage_context=storage_context, embed_model=embed_model, transformations=transformations, count_workers=settings.embedding.count_workers, + llm=kwargs.get("llm"), ) else: return SimpleIngestComponent( storage_context=storage_context, embed_model=embed_model, transformations=transformations, + llm=kwargs.get("llm"), ) From f80661f7642f46e76ac7108acb3111f792ea83bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Wed, 27 Mar 2024 20:14:34 +0100 Subject: [PATCH 08/12] add graph and router retrievers instead just vector retriever --- private_gpt/server/chat/chat_service.py | 24 +++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/private_gpt/server/chat/chat_service.py b/private_gpt/server/chat/chat_service.py index a84b96b99..584073b9b 100644 --- a/private_gpt/server/chat/chat_service.py +++ b/private_gpt/server/chat/chat_service.py @@ -11,7 +11,10 @@ from llama_index.core.postprocessor import ( SimilarityPostprocessor, ) +from llama_index.core.retrievers.router_retriever import RouterRetriever +from llama_index.core.selectors import LLMSingleSelector from llama_index.core.storage import StorageContext +from llama_index.core.tools.retriever_tool import RetrieverTool from llama_index.core.types import TokenGen from pydantic import BaseModel @@ -120,9 +123,28 @@ def _chat_engine( context_filter=context_filter, similarity_top_k=self.settings.rag.similarity_top_k, ) + graph_knowledge_retriever = self.graph_store_component.get_knowledge_graph( + llm=self.llm_component.llm, + storage_context=self.storage_context, + ) + + retrievers = [ + r for r in [vector_index_retriever, graph_knowledge_retriever] if r + ] + retriever = RouterRetriever.from_defaults( + retriever_tools=[ + RetrieverTool.from_defaults(retriever) for 
retriever in retrievers + ], + llm=self.llm_component.llm, + selector=LLMSingleSelector.from_defaults( + llm=self.llm_component.llm + ), # TODO: Could be LLMMultiSelector if needed + select_multi=len(retrievers) > 1, + ) + return ContextChatEngine.from_defaults( system_prompt=system_prompt, - retriever=vector_index_retriever, + retriever=retriever, llm=self.llm_component.llm, # Takes no effect at the moment node_postprocessors=[ MetadataReplacementPostProcessor(target_metadata_key="window"), From baf84c2f51db2be9fe2f7ce89c638ca90edb6d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Wed, 27 Mar 2024 20:15:30 +0100 Subject: [PATCH 09/12] allow to save knowledge graph in SimpleIngestComponent --- .../components/ingest/ingest_component.py | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/private_gpt/components/ingest/ingest_component.py b/private_gpt/components/ingest/ingest_component.py index 393ab7fd1..54010d924 100644 --- a/private_gpt/components/ingest/ingest_component.py +++ b/private_gpt/components/ingest/ingest_component.py @@ -179,14 +179,35 @@ def bulk_ingest(self, files: list[tuple[str, Path]]) -> list[Document]: def _save_docs(self, documents: list[Document]) -> list[Document]: logger.debug("Transforming count=%s documents into nodes", len(documents)) with self._index_thread_lock: - for document in documents: - self._index.insert(document, show_progress=True) - logger.debug("Persisting the index and nodes") - # persist the index and nodes - self._save_index() + logger.debug("Persisting the index and nodes in the vector store") + self._save_to_index(documents) + + logger.debug("Persisting the index and nodes in the knowledge graph") + self._save_to_knowledge_graph(documents) + logger.debug("Persisted the index and nodes") return documents + def _save_to_index(self, documents: list[Document]) -> None: + logger.debug("Inserting count=%s documents in the index", len(documents)) + for 
document in documents: + logger.info("Inserting document=%s in the index", document) + self._index.insert(document, show_progress=True) + self._save_index() + pass + + def _save_to_knowledge_graph(self, documents: list[Document]) -> None: + logger.debug( + "Inserting count=%s documents in the knowledge graph", len(documents) + ) + for document in [ + d for d in documents if d.extra_info.get("graph_type", None) is not None + ]: + logger.info("Inserting document=%s in the knowledge graph", document) + logger.info("Document=%s", document.extra_info) + self._knowledge_graph.insert(document, show_progress=True) + self._save_knowledge_graph() + class BatchIngestComponent(BaseIngestComponentWithIndex): """Parallelize the file reading and parsing on multiple CPU core. From 18ace2d593f717862c91f6f2dca4b5134b902479 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Wed, 27 Mar 2024 20:15:54 +0100 Subject: [PATCH 10/12] add missing configurations --- private_gpt/server/ingest/ingest_service.py | 1 + settings-ollama.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/private_gpt/server/ingest/ingest_service.py b/private_gpt/server/ingest/ingest_service.py index 544708579..0243c0e91 100644 --- a/private_gpt/server/ingest/ingest_service.py +++ b/private_gpt/server/ingest/ingest_service.py @@ -51,6 +51,7 @@ def __init__( embed_model=embedding_component.embedding_model, transformations=[node_parser, embedding_component.embedding_model], settings=settings(), + llm=self.llm_service.llm, ) def _ingest_data(self, file_name: str, file_data: AnyStr) -> list[IngestedDoc]: diff --git a/settings-ollama.yaml b/settings-ollama.yaml index 3037a69b7..fabaf5dfa 100644 --- a/settings-ollama.yaml +++ b/settings-ollama.yaml @@ -9,6 +9,7 @@ llm: embedding: mode: ollama + ingest_mode: simple # Enabled simple that stores the embeddings and triples in the graphstore. 
ollama: llm_model: mistral From dc776d91a0a1a1ed3b59aa7d32100877f1046a92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Wed, 27 Mar 2024 20:18:18 +0100 Subject: [PATCH 11/12] add initial knowledge graph documentation --- fern/docs/pages/manual/knowledge-graph.mdx | 33 ++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 fern/docs/pages/manual/knowledge-graph.mdx diff --git a/fern/docs/pages/manual/knowledge-graph.mdx b/fern/docs/pages/manual/knowledge-graph.mdx new file mode 100644 index 000000000..73ded1289 --- /dev/null +++ b/fern/docs/pages/manual/knowledge-graph.mdx @@ -0,0 +1,33 @@ +# GraphStore Providers +PrivateGPT supports [Neo4J](https://neo4j.com/). + +To enable it, set the `graphstore.database` property in the `settings.yaml` file to `neo4j`. + +```yaml +graphstore: + database: neo4j +``` + +## Neo4j + +Neo4j is a graph database management system that provides an efficient and scalable solution for storing and querying graph data. + +### Configuration + +To configure Neo4j as the graph store provider, specify the following parameters in the `settings.yaml` file: + +```yaml +graphstore: + database: neo4j + +neo4j: + url: neo4j://localhost:7687 + username: neo4j + password: password + database: neo4j +``` + +- **url**: The URL of the Neo4j server. +- **username**: The username for accessing the Neo4j database. +- **password**: The password for accessing the Neo4j database. +- **database**: The name of the Neo4j database. 
From eb571f5641fe26bf6f157be7f26bc16972de3551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Marti=CC=81nez=20A=CC=81lvarez?= Date: Wed, 27 Mar 2024 20:34:00 +0100 Subject: [PATCH 12/12] make graph optional --- private_gpt/settings/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py index 4211634c7..fc9be5189 100644 --- a/private_gpt/settings/settings.py +++ b/private_gpt/settings/settings.py @@ -412,7 +412,7 @@ class Settings(BaseModel): ollama: OllamaSettings azopenai: AzureOpenAISettings vectorstore: VectorstoreSettings - graphstore: GraphStoreSettings + graphstore: GraphStoreSettings | None = None nodestore: NodeStoreSettings rag: RagSettings qdrant: QdrantSettings | None = None