diff --git a/.gitignore b/.gitignore
index 516d4560..80ef73e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ dedoc/version.py
 
 # Distribution / packaging
 .Python
+etc/
 env/
 build/
 develop-eggs/
diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py
index af9554ec..5a017c05 100644
--- a/dedoc/readers/archive_reader/archive_reader.py
+++ b/dedoc/readers/archive_reader/archive_reader.py
@@ -85,14 +85,27 @@ def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: boo
                     yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
 
     def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
-        import py7zlib
-
-        with open(path, "rb") as content:
-            arch_file = py7zlib.Archive7z(content)
-            names = arch_file.getnames()
-            for name in names:
-                file = arch_file.getmember(name)
-                yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
+        """
+        Extract every file of the 7z archive `path` and yield each one as an attachment saved via `__save_archive_file`.
+
+        The archive is unpacked into a scratch directory that is removed once the generator finishes or is closed.
+        """
+        import os
+        import py7zr
+        import tempfile
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with py7zr.SevenZipFile(path, "r") as arch_file:
+                arch_file.extractall(tmpdir)
+
+            for dir_path, dir_names, file_names in os.walk(tmpdir):
+                dir_names.sort()  # os.walk lists entries in arbitrary order; sort in place so the traversal order is stable
+                for file_name in sorted(file_names):
+                    file_path = os.path.join(dir_path, file_name)
+                    # pass the path relative to the archive root (with "/" separators) instead of the bare base name
+                    archive_name = os.path.relpath(file_path, tmpdir).replace(os.sep, "/")
+                    with open(file_path, "rb") as file:
+                        yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=archive_name, file=file, need_content_analysis=need_content_analysis)
 
     def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes], need_content_analysis: bool) -> AttachedFile:
         import os
diff --git a/requirements.txt b/requirements.txt
index 995865df..9a606e10 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c'
 pdfminer.six>=20211012,<=20231228
 piexif==1.1.3
 puremagic>=1.0,<2.0 # needs libmagic to be installed in the system
-pylzma==0.5.0
+py7zr~=1.0
 pypdf>=4.0,<6.0
 pytesseract==0.3.10
 python-docx==0.8.11
diff --git a/tests/data/with_attachments/attachments.7z b/tests/data/with_attachments/attachments.7z
deleted file mode 100644
index 505c38da..00000000
Binary files a/tests/data/with_attachments/attachments.7z and /dev/null differ
diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py
index c0b69968..f40a1753 100644
--- a/tests/unit_tests/test_module_attachment_extractor.py
+++ b/tests/unit_tests/test_module_attachment_extractor.py
@@ -98,7 +98,7 @@ def test_archive_with_slash(self) -> None:
         Tests attachment extraction from archives with files containing slash symbol in the name
         """
         file_name_template = "attachments.{}"
-        for extension in "7z", "tar", "tar.gz", "zip":
+        for extension in "tar", "tar.gz", "zip":
             file_name = file_name_template.format(extension)
             files = self.__get_list_of_files_in_archive(file_name)
             self.assertEqual(2, len(files))