Hi,

I am trying to ingest a huge dataset (30 GB, 340k files) and finally got this error:
Traceback (most recent call last):
  File "F:\LLM\privateGPT\scripts\ingest_folder.py", line 102, in <module>
    worker.ingest_folder(root_path, args.ignored)
  File "F:\LLM\privateGPT\scripts\ingest_folder.py", line 38, in ingest_folder
    self._ingest_all(self._files_under_root_folder)
  File "F:\LLM\privateGPT\scripts\ingest_folder.py", line 42, in _ingest_all
    self.ingest_service.bulk_ingest([(str(p.name), p) for p in files_to_ingest])
  File "F:\LLM\privateGPT\private_gpt\server\ingest\ingest_service.py", line 92, in bulk_ingest
    documents = self.ingest_component.bulk_ingest(files)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "F:\LLM\privateGPT\private_gpt\components\ingest\ingest_component.py", line 130, in bulk_ingest
    saved_documents.extend(self._save_docs(documents))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "F:\LLM\privateGPT\private_gpt\components\ingest\ingest_component.py", line 137, in _save_docs
    self._index.insert(document, show_progress=True)
  File "C:\Users\vasilii\AppData\Local\pypoetry\Cache\virtualenvs\private-gpt-GIMTD_9E-py3.11\Lib\site-packages\llama_index\indices\base.py", line 197, in insert
    self.insert_nodes(nodes, **insert_kwargs)
  File "C:\Users\vasilii\AppData\Local\pypoetry\Cache\virtualenvs\private-gpt-GIMTD_9E-py3.11\Lib\site-packages\llama_index\indices\vector_store\base.py", line 267, in insert_nodes
    self._insert(nodes, **insert_kwargs)
  File "C:\Users\vasilii\AppData\Local\pypoetry\Cache\virtualenvs\private-gpt-GIMTD_9E-py3.11\Lib\site-packages\llama_index\indices\vector_store\base.py", line 258, in _insert
    self._add_nodes_to_index(self._index_struct, nodes, **insert_kwargs)
  File "C:\Users\vasilii\AppData\Local\pypoetry\Cache\virtualenvs\private-gpt-GIMTD_9E-py3.11\Lib\site-packages\llama_index\indices\vector_store\base.py", line 189, in _add_nodes_to_index
    new_ids = self._vector_store.add(nodes, **insert_kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\vasilii\AppData\Local\pypoetry\Cache\virtualenvs\private-gpt-GIMTD_9E-py3.11\Lib\site-packages\llama_index\vector_stores\qdrant.py", line 127, in add
    self._client.upsert(
  File "C:\Users\vasilii\AppData\Local\pypoetry\Cache\virtualenvs\private-gpt-GIMTD_9E-py3.11\Lib\site-packages\qdrant_client\qdrant_client.py", line 987, in upsert
    return self._client.upsert(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\vasilii\AppData\Local\pypoetry\Cache\virtualenvs\private-gpt-GIMTD_9E-py3.11\Lib\site-packages\qdrant_client\local\qdrant_local.py", line 421, in upsert
    collection.upsert(points)
  File "C:\Users\vasilii\AppData\Local\pypoetry\Cache\virtualenvs\private-gpt-GIMTD_9E-py3.11\Lib\site-packages\qdrant_client\local\local_collection.py", line 1090, in upsert
    self._upsert_point(
  File "C:\Users\vasilii\AppData\Local\pypoetry\Cache\virtualenvs\private-gpt-GIMTD_9E-py3.11\Lib\site-packages\qdrant_client\local\local_collection.py", line 1070, in _upsert_point
    self.storage.persist(point)
  File "C:\Users\vasilii\AppData\Local\pypoetry\Cache\virtualenvs\private-gpt-GIMTD_9E-py3.11\Lib\site-packages\qdrant_client\local\persistence.py", line 119, in persist
    cursor.execute(
sqlite3.DataError: string or blob too big
Which obviously means there is too much data and I have run into SQLite's limits: the error comes from the embedded (local) Qdrant client, which persists points into a SQLite file.
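If it helps, here is a minimal repro sketch of the limit I believe I am hitting (plain `sqlite3`, not privateGPT code; the lowered limit is only there so a small blob triggers the same error class):

```python
import sqlite3

# SQLite rejects any single string/BLOB larger than its SQLITE_LIMIT_LENGTH
# (the default is roughly 1 GB). Shrinking the limit lets a tiny blob
# reproduce the same error class seen in the traceback above.
conn = sqlite3.connect(":memory:")
conn.setlimit(sqlite3.SQLITE_LIMIT_LENGTH, 1024)  # Python 3.11+, demo-only value
conn.execute("CREATE TABLE points (point BLOB)")

try:
    conn.execute("INSERT INTO points VALUES (?)", (b"\x00" * 2048,))
except sqlite3.DataError as exc:
    print(type(exc).__name__, exc)  # expected: DataError string or blob too big
```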
So, could anyone help me to answer the following questions:
Does it make sense to ingest such an enormous amount of data?
Can I somehow fix the error above? Does it make sense to use another DB/DB engine?
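For context, my understanding of the two qdrant-client modes (sketch with illustrative paths/URLs, not my actual config) is that only the embedded mode goes through SQLite, so pointing privateGPT at a Qdrant server might avoid the limit entirely; I have not confirmed whether privateGPT's settings expose this:

```python
from qdrant_client import QdrantClient

# Embedded ("local") mode: stores points on disk via SQLite,
# which is where "string or blob too big" is raised.
local_client = QdrantClient(path="local_data/private_gpt/qdrant")

# Server mode: talks to a running Qdrant instance over HTTP/gRPC
# and does not use SQLite at all.
server_client = QdrantClient(url="http://localhost:6333")
```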