Skip to content

Kept getting NLTK error. #14

@jpamintuan

Description

@jpamintuan

Hi, I ran `create_database.py`, but I kept getting this error:
`

(ONE_RAG_TEST) ubuntu@ip-~/test_ollama/langchain-rag-tutorial$ python create_database.py
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data] Package punkt is already up-to-date!
Error loading file data/books/alice_in_wonderland.md
Traceback (most recent call last):
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/create_database.py", line 73, in <module>
main()
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/create_database.py", line 27, in main
generate_data_store()
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/create_database.py", line 31, in generate_data_store
documents = load_documents()
^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/create_database.py", line 38, in load_documents
documents = loader.load()
^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/langchain_community/document_loaders/directory.py", line 117, in load
return list(self.lazy_load())
^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/langchain_community/document_loaders/directory.py", line 182, in lazy_load
yield from self._lazy_load_file(i, p, pbar)
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/langchain_community/document_loaders/directory.py", line 220, in _lazy_load_file
raise e
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/langchain_community/document_loaders/directory.py", line 210, in _lazy_load_file
for subdoc in loader.lazy_load():
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/langchain_community/document_loaders/unstructured.py", line 88, in lazy_load
elements = self._get_elements()
^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/langchain_community/document_loaders/unstructured.py", line 180, in _get_elements
return partition(filename=self.file_path, **self.unstructured_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/partition/auto.py", line 415, in partition
elements = _partition_md(
^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/documents/elements.py", line 591, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/file_utils/filetype.py", line 618, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/file_utils/filetype.py", line 582, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/chunking/dispatch.py", line 74, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/partition/md.py", line 112, in partition_md
return partition_html(
^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/documents/elements.py", line 591, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/file_utils/filetype.py", line 618, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/file_utils/filetype.py", line 582, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/chunking/dispatch.py", line 74, in wrapper
elements = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/partition/html.py", line 149, in partition_html
document_to_element_list(
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/partition/common.py", line 559, in document_to_element_list
num_pages = len(document.pages)
^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/documents/xml.py", line 54, in pages
self._pages = self._parse_pages_from_element_tree()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/documents/html.py", line 173, in _parse_pages_from_element_tree
_page_elements, descendanttag_elems = _process_text_tag(tag_elem)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/documents/html.py", line 630, in _process_text_tag
element = _parse_tag(tag_elem, include_tail_text)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/documents/html.py", line 438, in _parse_tag
return _text_to_element(
^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/documents/html.py", line 486, in _text_to_element
elif is_narrative_tag(text, tag):
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/documents/html.py", line 536, in is_narrative_tag
return tag not in HEADING_TAGS and is_possible_narrative_text(text)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/partition/text_type.py", line 80, in is_possible_narrative_text
if exceeds_cap_ratio(text, threshold=cap_threshold):
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/partition/text_type.py", line 276, in exceeds_cap_ratio
if sentence_count(text, 3) > 1:
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/partition/text_type.py", line 225, in sentence_count
sentences = sent_tokenize(text)
^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/unstructured/nlp/tokenize.py", line 30, in sent_tokenize
return _sent_tokenize(text)
^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/nltk/tokenize/__init__.py", line 119, in sent_tokenize
tokenizer = _get_punkt_tokenizer(language)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/nltk/tokenize/__init__.py", line 105, in _get_punkt_tokenizer
return PunktTokenizer(language)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/nltk/tokenize/punkt.py", line 1744, in __init__
self.load_lang(lang)
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/nltk/tokenize/punkt.py", line 1749, in load_lang
lang_dir = find(f"tokenizers/punkt_tab/{lang}/")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/python3.12/site-packages/nltk/data.py", line 579, in find
raise LookupError(resource_not_found)
LookupError:


Resource punkt_tab not found.
Please use the NLTK Downloader to obtain the resource:

import nltk
nltk.download('punkt_tab')

For more information see: https://www.nltk.org/data.html

Attempted to load tokenizers/punkt_tab/english/

Searched in:
- '/home/ubuntu/nltk_data'
- '/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/nltk_data'
- '/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/share/nltk_data'
- '/home/ubuntu/test_ollama/langchain-rag-tutorial/ONE_RAG_TEST/lib/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'


(ONE_RAG_TEST) ubuntu:~/test_ollama/langchain-rag-tutorial$

`

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions