Skip to content

Commit 106429f

Browse files
authored
pkg_tar: Fix #948: Don't add duplicate directory members to tar file when a symlink with the same path exists (#949)
Fixes #948. I've modified the deduplication logic to account for the fact that directories must not have the same path as other types of files. Previously, the paths `/lib` and `/lib/` would be treated differently. Two test cases are included: one for symlinks preceding directories, and one for directories preceding symlinks. In both cases, the first member is preserved and the second member with the same path is discarded as a duplicate.
2 parents e708301 + 84dbe44 commit 106429f

File tree

2 files changed

+66
-18
lines changed

2 files changed

+66
-18
lines changed

pkg/private/tar/tar_writer.py

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,18 @@
3535

3636
_DEBUG_VERBOSITY = 0
3737

38+
TARFILE_MEMBER_TYPE_TO_STR = {
39+
b"0": "REGTYPE",
40+
b"\0": "AREGTYPE",
41+
b"1": "LNKTYPE",
42+
b"2": "SYMTYPE",
43+
b"3": "CHRTYPE",
44+
b"4": "BLKTYPE",
45+
b"5": "DIRTYPE",
46+
b"6": "FIFOTYPE",
47+
b"7": "CONTTYPE",
48+
}
49+
3850

3951
class TarFileWriter(object):
4052
"""A wrapper to write tar files."""
@@ -103,13 +115,7 @@ def __init__(self,
103115

104116
self.tar = tarfile.open(name=name, mode=mode, fileobj=self.fileobj,
105117
format=tarfile.GNU_FORMAT)
106-
self.members = set()
107-
self.directories = set()
108-
# Preseed the added directory list with things we should not add. If we
109-
# some day need to allow '.' or '/' as an explicit member of the archive,
110-
# we can adjust that here based on the setting of root_directory.
111-
self.directories.add('/')
112-
self.directories.add('./')
118+
self.existing_members = {}
113119
self.create_parents = create_parents
114120
self.allow_dups_from_deps = allow_dups_from_deps
115121

@@ -119,28 +125,43 @@ def __enter__(self):
119125
def __exit__(self, t, v, traceback):
120126
self.close()
121127

122-
def _have_added(self, path):
123-
"""Have we added this file before."""
124-
return (path in self.members) or (path in self.directories)
128+
def _existing_member_type(self, path):
129+
"""Retrieve an existing tar file member's type if we have added it previously,
130+
return None otherwise."""
131+
# Things we should not add.
132+
# If we some day need to allow '.' or '/' as an explicit member of the archive,
133+
# we can adjust that here based on the setting of root_directory.
134+
if path == '/' or path == './':
135+
return tarfile.DIRTYPE
136+
137+
normalized_path = path.rstrip("/")
138+
return self.existing_members.get(normalized_path, None)
125139

126140
def _addfile(self, info, fileobj=None):
127141
"""Add a file in the tar file if there is no conflict."""
128142
if info.type == tarfile.DIRTYPE:
129-
# Enforce the ending / for directories so we correctly deduplicate.
143+
# Enforce the ending / for directories.
130144
if not info.name.endswith('/'):
131145
info.name += '/'
132-
if not self.allow_dups_from_deps and self._have_added(info.name):
146+
existing_member_type = self._existing_member_type(info.name)
147+
if not self.allow_dups_from_deps and existing_member_type is not None:
133148
# Directories with different contents should get merged without warnings.
134149
# If they have overlapping content, the warning will be on their duplicate *files* instead
135150
if info.type != tarfile.DIRTYPE:
136151
print('Duplicate file in archive: %s, '
137152
'picking first occurrence' % info.name)
153+
# Directories that shadow
154+
elif existing_member_type != tarfile.DIRTYPE and existing_member_type != tarfile.SYMTYPE:
155+
print('Directory shadows a member of type %s in archive: %s, '
156+
'picking first occurrence' % (TARFILE_MEMBER_TYPE_TO_STR.get(
157+
existing_member_type, "UNKNOWN"), info.name))
158+
138159
return
139160

140161
self.tar.addfile(info, fileobj)
141-
self.members.add(info.name)
142-
if info.type == tarfile.DIRTYPE:
143-
self.directories.add(info.name)
162+
# Strip the trailing slash from the path so that we can detect when, for example, we are
163+
# trying to overwrite a symbolic link with a directory.
164+
self.existing_members[info.name.rstrip("/")] = info.type
144165

145166
def add_directory_path(self,
146167
path,
@@ -182,7 +203,8 @@ def conditionally_add_parents(self, path, uid=0, gid=0, uname='', gname='', mtim
182203
for next_level in dirs[0:-1]:
183204
parent_path = parent_path + next_level + '/'
184205

185-
if self.create_parents and not self._have_added(parent_path):
206+
if self.create_parents and self._existing_member_type(
207+
parent_path) is None:
186208
self.add_directory_path(
187209
parent_path,
188210
uid=uid,
@@ -224,7 +246,8 @@ def add_file(self,
224246
return
225247
if name == '.':
226248
return
227-
if not self.allow_dups_from_deps and name in self.members:
249+
if not self.allow_dups_from_deps and self._existing_member_type(
250+
name) is not None:
228251
return
229252

230253
if mtime is None:

tests/tar/tar_writer_test.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ def testPreserveTarMtimesTrueByDefault(self):
162162
"rules_pkg/tests/testdata/tar_test.tar")
163163
f.add_tar(input_tar_path)
164164
input_tar = tarfile.open(input_tar_path, "r")
165-
for file_name in f.members:
165+
for file_name in f.existing_members.keys():
166166
input_file = input_tar.getmember(file_name)
167167
output_file = f.tar.getmember(file_name)
168168
self.assertEqual(input_file.mtime, output_file.mtime)
@@ -300,6 +300,31 @@ def testOnlyIntermediateParentsInferred(self):
300300

301301
self.assertTarFileContent(self.tempfile, expected_content)
302302

303+
def testDirectoryDoesNotShadowSymlink(self):
304+
with tar_writer.TarFileWriter(self.tempfile, create_parents=True, allow_dups_from_deps=False) as f:
305+
f.add_file("target_dir", tarfile.DIRTYPE)
306+
f.add_file("symlink", tarfile.SYMTYPE, link="target_dir")
307+
f.add_file("symlink", tarfile.DIRTYPE)
308+
f.add_file('symlink/a', content="q")
309+
content = [
310+
{"name": "target_dir", "type": tarfile.DIRTYPE},
311+
{"name": "symlink", "type": tarfile.SYMTYPE},
312+
{"name": "symlink/a", "type": tarfile.REGTYPE},
313+
]
314+
self.assertTarFileContent(self.tempfile, content)
315+
316+
def testSymlinkDoesNotShadowDirectory(self):
317+
with tar_writer.TarFileWriter(self.tempfile, create_parents=True, allow_dups_from_deps=False) as f:
318+
f.add_file("target_dir", tarfile.DIRTYPE)
319+
f.add_file("not_a_symlink", tarfile.DIRTYPE)
320+
f.add_file("not_a_symlink", tarfile.SYMTYPE, link="target_dir")
321+
f.add_file('not_a_symlink/a', content="q")
322+
content = [
323+
{"name": "target_dir", "type": tarfile.DIRTYPE},
324+
{"name": "not_a_symlink", "type": tarfile.DIRTYPE},
325+
{"name": "not_a_symlink/a", "type": tarfile.REGTYPE},
326+
]
327+
self.assertTarFileContent(self.tempfile, content)
303328

304329

305330
if __name__ == "__main__":

0 commit comments

Comments
 (0)