-
Notifications
You must be signed in to change notification settings - Fork 7.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PoC: Added initial Knowledge Graph support #1801
base: main
Are you sure you want to change the base?
Changes from all commits
d69299b
ace1466
d7908bc
93db414
b4b4cbb
d863fd4
8acf3db
f80661f
baf84c2
18ace2d
dc776d9
eb571f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# GraphStore Providers | ||
PrivateGPT supports [Neo4j](https://neo4j.com/) as its graph store provider. | ||
|
||
To enable it, set the `graphstore.database` property in the `settings.yaml` file to `neo4j`. | ||
|
||
```yaml | ||
graphstore: | ||
database: neo4j | ||
``` | ||
|
||
## Neo4j | ||
|
||
Neo4j is a graph database management system that provides an efficient and scalable solution for storing and querying graph data. | ||
|
||
### Configuration | ||
|
||
To configure Neo4j as the graph store provider, specify the following parameters in the `settings.yaml` file: | ||
|
||
```yaml | ||
graphstore: | ||
database: neo4j | ||
|
||
neo4j: | ||
url: neo4j://localhost:7687 | ||
username: neo4j | ||
password: password | ||
database: neo4j | ||
``` | ||
|
||
- **url**: The URL of the Neo4j server. | ||
- **username**: The username for accessing the Neo4j database. | ||
- **password**: The password for accessing the Neo4j database. | ||
- **database**: The name of the Neo4j database. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import logging | ||
import typing | ||
|
||
from injector import inject, singleton | ||
from llama_index.core.graph_stores.types import ( | ||
GraphStore, | ||
) | ||
from llama_index.core.indices.knowledge_graph import ( | ||
KnowledgeGraphRAGRetriever, | ||
) | ||
from llama_index.core.llms.llm import LLM | ||
from llama_index.core.storage import StorageContext | ||
|
||
from private_gpt.settings.settings import Settings | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
@singleton | ||
class GraphStoreComponent: | ||
settings: Settings | ||
graph_store: GraphStore | None = None | ||
|
||
@inject | ||
def __init__(self, settings: Settings) -> None: | ||
self.settings = settings | ||
|
||
# If no graphstore is defined, return, making the graphstore optional | ||
if settings.graphstore is None: | ||
return | ||
|
||
match settings.graphstore.database: | ||
case "neo4j": | ||
try: | ||
from llama_index.graph_stores.neo4j import ( # type: ignore | ||
Neo4jGraphStore, | ||
) | ||
except ImportError as e: | ||
raise ImportError( | ||
"Neo4j dependencies not found, install with `poetry install --extras graph-stores-neo4j`" | ||
) from e | ||
|
||
if settings.neo4j is None: | ||
raise ValueError( | ||
"Neo4j settings not found. Please provide settings." | ||
) | ||
|
||
self.graph_store = typing.cast( | ||
GraphStore, | ||
Neo4jGraphStore( | ||
**settings.neo4j.model_dump(exclude_none=True), | ||
), # TODO | ||
) | ||
case _: | ||
# Should be unreachable | ||
# The settings validator should have caught this | ||
raise ValueError( | ||
f"Vectorstore database {settings.vectorstore.database} not supported" | ||
) | ||
|
||
def get_knowledge_graph( | ||
self, | ||
storage_context: StorageContext, | ||
llm: LLM, | ||
) -> KnowledgeGraphRAGRetriever: | ||
if self.graph_store is None: | ||
raise ValueError("GraphStore not defined in settings") | ||
|
||
return KnowledgeGraphRAGRetriever( | ||
storage_context=storage_context, | ||
llm=llm, | ||
verbose=True, | ||
) | ||
|
||
def close(self) -> None: | ||
if self.graph_store and hasattr(self.graph_store.client, "close"): | ||
self.graph_store.client.close() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,10 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]: | |
from llama_index.readers.file.video_audio import ( # type: ignore | ||
VideoAudioReader, | ||
) | ||
|
||
from private_gpt.components.ingest.readers.rdfreader import ( # type: ignore | ||
RDFReader, | ||
) | ||
except ImportError as e: | ||
raise ImportError("`llama-index-readers-file` package not found") from e | ||
|
||
|
@@ -48,7 +52,10 @@ def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]: | |
".mbox": MboxReader, | ||
".ipynb": IPYNBReader, | ||
} | ||
return default_file_reader_cls | ||
optional_file_reader_cls: dict[str, type[BaseReader]] = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you can move it back with the default readers, you are importing it unconditionally anyway |
||
".ttl": RDFReader, | ||
} | ||
return {**default_file_reader_cls, **optional_file_reader_cls} | ||
|
||
|
||
# Patching the default file reader to support other file types | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# mypy: ignore-errors | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is a bit dangerous, what types were giving trouble? |
||
|
||
"""Read RDF files. | ||
|
||
This module is used to read RDF files. | ||
It was created by llama-hub but it has not been ported | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, it was ported to llama-index 0.1.0 with fixes, right? This sentence is a little bit confusing... |
||
to llama-index==0.1.0 with multiple changes to fix the code. | ||
|
||
Original code: | ||
https://github.com/run-llama/llama-hub | ||
""" | ||
|
||
import logging | ||
from pathlib import Path | ||
from typing import Any | ||
|
||
from llama_index.core.readers.base import BaseReader | ||
from llama_index.core.schema import Document | ||
from rdflib import Graph, URIRef | ||
from rdflib.namespace import RDF, RDFS | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class RDFReader(BaseReader):
    """RDF reader.

    Turns the triples of an RDF file into one text document, with one
    ``<subject> <predicate> <object>`` line per triple. Labels are used in
    place of URIs where available.
    """

    def __init__(
        self,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Initialize loader."""
        super().__init__(*args, **kwargs)

    def fetch_labels(self, uri: URIRef, graph: Graph, lang: str) -> list[Any]:
        """Fetch all labels of a URI by language.

        Labels with no language tag are kept alongside those tagged ``lang``.
        """
        return [
            label
            for label in graph.objects(uri, RDFS.label)
            if label.language in (lang, None)
        ]

    def fetch_label_in_graphs(self, uri: URIRef, lang: str = "en") -> Any:
        """Fetch one label of a URI by language from the local or global graph.

        The local graph is searched first, then the global one. If neither has
        a label, the URI itself is returned as a string. Requires ``load_data``
        to have set ``self.g_local`` and ``self.g_global``.
        """
        for graph in (self.g_local, self.g_global):
            labels = self.fetch_labels(uri, graph, lang)
            if labels:
                return labels[0].value

        return str(uri)

    def load_data(self, file: Path, extra_info: dict | None = None) -> list[Document]:
        """Parse file.

        Args:
            file: Path of the RDF file to parse.
            extra_info: Optional metadata. A ``"lang"`` entry selects the label
                language (default ``"en"``). The dict is copied, never mutated.

        Returns:
            A single-element list holding one document with all triples, one
            per line, and the metadata plus ``graph_type="rdf"``.
        """
        # Copy so that adding "graph_type" does not leak into the caller's dict.
        extra_info = dict(extra_info) if extra_info else {}
        extra_info["graph_type"] = "rdf"
        lang = extra_info.get("lang", "en")

        self.g_local = Graph()
        self.g_local.parse(file)

        # NOTE(review): parse() receives the RDF/RDFS namespace URIs as strings;
        # presumably these are fetched on every call - confirm.
        self.g_global = Graph()
        self.g_global.parse(str(RDF))
        self.g_global.parse(str(RDFS))

        text_list = []

        for s, p, o in self.g_local:
            logger.debug("s=%s, p=%s, o=%s", s, p, o)
            # Label triples are only used to name other nodes, not emitted.
            if p == RDFS.label:
                continue

            subj_label = self.fetch_label_in_graphs(s, lang=lang)
            pred_label = self.fetch_label_in_graphs(p, lang=lang)
            obj_label = self.fetch_label_in_graphs(o, lang=lang)

            if subj_label is None or pred_label is None or obj_label is None:
                continue

            text_list.append(f"<{subj_label}> <{pred_label}> <{obj_label}>")

        text = "\n".join(text_list)
        return [self._text_to_document(text, extra_info)]

    def _text_to_document(self, text: str, extra_info: dict | None = None) -> Document:
        """Wrap ``text`` and its metadata in a ``Document``."""
        return Document(text=text, extra_info=extra_info or {})
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this feels error prone, can't you use the type directly?