From 0ac8c37177fc8c92938ad3cc35cc9aebb53e7e0f Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 20 Nov 2025 16:17:45 +0300 Subject: [PATCH 1/2] TLDR-922: replace pylzma by py7zr --- .gitignore | 1 + .../readers/archive_reader/archive_reader.py | 21 ++++++++++++------- requirements.txt | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 516d4560..80ef73e4 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ dedoc/version.py # Distribution / packaging .Python +etc/ env/ build/ develop-eggs/ diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py index af9554ec..5a017c05 100644 --- a/dedoc/readers/archive_reader/archive_reader.py +++ b/dedoc/readers/archive_reader/archive_reader.py @@ -85,14 +85,19 @@ def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: boo yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis) def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]: - import py7zlib - - with open(path, "rb") as content: - arch_file = py7zlib.Archive7z(content) - names = arch_file.getnames() - for name in names: - file = arch_file.getmember(name) - yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis) + import os + import py7zr + import tempfile + + with tempfile.TemporaryDirectory() as tmpdir: + with py7zr.SevenZipFile(path, "r") as arch_file: + arch_file.extractall(tmpdir) + + for dir_path, _, file_names in os.walk(tmpdir): + for file_name in file_names: + file_path = os.path.join(dir_path, file_name) + with open(file_path, "rb") as file: + yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=file_name, file=file, need_content_analysis=need_content_analysis) def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes], need_content_analysis: bool) -> AttachedFile: import os diff --git a/requirements.txt b/requirements.txt index 995865df..9a606e10 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c' pdfminer.six>=20211012,<=20231228 piexif==1.1.3 puremagic>=1.0,<2.0 # needs libmagic to be installed in the system -pylzma==0.5.0 +py7zr~=1.0 pypdf>=4.0,<6.0 pytesseract==0.3.10 python-docx==0.8.11 From 8dd38a256897da8eba2b41410e6dcd7c24c76e2b Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 20 Nov 2025 16:38:19 +0300 Subject: [PATCH 2/2] TLDR-922: fix tests --- tests/data/with_attachments/attachments.7z | Bin 242 -> 0 bytes .../test_module_attachment_extractor.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 tests/data/with_attachments/attachments.7z diff --git a/tests/data/with_attachments/attachments.7z b/tests/data/with_attachments/attachments.7z deleted file mode 100644 index 505c38dab416043e3aa4a7709879e72ee8431865..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 242 zcmXr7+Ou9=hJi(k@6fvq3=p6MrN6RwW-~G<8|oP87;_oxnCKXBrDYZsmnbOYWag!E z6{jZW=cPc{Kpl<7?Cbb1v>&pWZ}0BN!e7Yq`^AHb@DGgaTMoY#dOy8f_oZ09PsqAe zcX<^OW?kEEu}(+w>ysM4=sj2DoZihy+J9)x5GeLKS$7ywI=SWy4~ diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py index c0b69968..f40a1753 100644 --- a/tests/unit_tests/test_module_attachment_extractor.py +++ b/tests/unit_tests/test_module_attachment_extractor.py @@ -98,7 +98,7 @@ def test_archive_with_slash(self) -> None: Tests attachment extraction from archives with files containing slash symbol in the name """ file_name_template = "attachments.{}" - for extension in "7z", "tar", "tar.gz", "zip": + for extension in "tar", "tar.gz", "zip": file_name = file_name_template.format(extension) files = self.__get_list_of_files_in_archive(file_name) self.assertEqual(2, len(files))