From 41aac8e782fdd9e2a19c6fadd27807782fc36203 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Wed, 23 Apr 2025 11:01:31 -0400
Subject: [PATCH 1/3] test-tool: add pack-deltas helper

When trying to demonstrate certain behavior in tests, it can be helpful
to create packfiles that have specific delta structures. 'git
pack-objects' uses various algorithms to select deltas based on their
compression rates, but that does not always demonstrate all possible
packfile shapes. This becomes especially important when wanting to test
'git index-pack' and its ability to parse certain pack shapes.

We have prior art in t/lib-pack.sh, where certain delta structures are
produced by manually writing certain opaque pack contents. However,
producing these script updates is cumbersome and difficult to do as a
contributor.

Instead, create a new test-tool, 'test-tool pack-deltas', that reads a
list of instructions for which objects to include in a packfile and how
those objects should be written in delta form.

At the moment, this only supports REF_DELTAs as those are the kinds of
deltas needed to exercise a bug in 'git index-pack'.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 Makefile                    |   1 +
 t/helper/meson.build        |   1 +
 t/helper/test-pack-deltas.c | 148 ++++++++++++++++++++++++++++++++++++
 t/helper/test-tool.c        |   1 +
 t/helper/test-tool.h        |   1 +
 5 files changed, 152 insertions(+)
 create mode 100644 t/helper/test-pack-deltas.c

diff --git a/Makefile b/Makefile
index 13f9062a056944..c4d21ccd3d1b6b 100644
--- a/Makefile
+++ b/Makefile
@@ -821,6 +821,7 @@ TEST_BUILTINS_OBJS += test-mergesort.o
 TEST_BUILTINS_OBJS += test-mktemp.o
 TEST_BUILTINS_OBJS += test-name-hash.o
 TEST_BUILTINS_OBJS += test-online-cpus.o
+TEST_BUILTINS_OBJS += test-pack-deltas.o
 TEST_BUILTINS_OBJS += test-pack-mtimes.o
 TEST_BUILTINS_OBJS += test-parse-options.o
 TEST_BUILTINS_OBJS += test-parse-pathspec-file.o
diff --git a/t/helper/meson.build b/t/helper/meson.build
index d2cabaa2bcfcc9..d4e8b26df8d6de 100644
--- a/t/helper/meson.build
+++ b/t/helper/meson.build
@@ -36,6 +36,7 @@ test_tool_sources = [
   'test-mktemp.c',
   'test-name-hash.c',
   'test-online-cpus.c',
+  'test-pack-deltas.c',
   'test-pack-mtimes.c',
   'test-parse-options.c',
   'test-parse-pathspec-file.c',
diff --git a/t/helper/test-pack-deltas.c b/t/helper/test-pack-deltas.c
new file mode 100644
index 00000000000000..4caa024b1ebe73
--- /dev/null
+++ b/t/helper/test-pack-deltas.c
@@ -0,0 +1,148 @@
+#define USE_THE_REPOSITORY_VARIABLE
+
+#include "test-tool.h"
+#include "git-compat-util.h"
+#include "delta.h"
+#include "git-zlib.h"
+#include "hash.h"
+#include "hex.h"
+#include "pack.h"
+#include "pack-objects.h"
+#include "parse-options.h"
+#include "setup.h"
+#include "strbuf.h"
+#include "string-list.h"
+
+static const char *usage_str[] = {
+	"test-tool pack-deltas --num-objects <num-objects>",
+	NULL
+};
+
+static unsigned long do_compress(void **pptr, unsigned long size)
+{
+	git_zstream stream;
+	void *in, *out;
+	unsigned long maxsize;
+
+	git_deflate_init(&stream, 1);
+	maxsize = git_deflate_bound(&stream, size);
+
+	in = *pptr;
+	out = xmalloc(maxsize);
+	*pptr = out;
+
+	stream.next_in = in;
+	stream.avail_in = size;
+	stream.next_out = out;
+	stream.avail_out = maxsize;
+	while (git_deflate(&stream, Z_FINISH) == Z_OK)
+		; /* nothing */
+	git_deflate_end(&stream);
+
+	free(in);
+	return stream.total_out;
+}
+
+static void write_ref_delta(struct hashfile *f,
+			    struct object_id *oid,
+			    struct object_id *base)
+{
+	unsigned char header[MAX_PACK_OBJECT_HEADER];
+	unsigned long size, base_size, delta_size, compressed_size, hdrlen;
+	enum object_type type;
+	void *base_buf, *delta_buf;
+	void *buf = repo_read_object_file(the_repository,
+					  oid, &type,
+					  &size);
+
+	if (!buf)
+		die("unable to read %s", oid_to_hex(oid));
+
+	base_buf = repo_read_object_file(the_repository,
+					 base, &type,
+					 &base_size);
+
+	if (!base_buf)
+		die("unable to read %s", oid_to_hex(base));
+
+	delta_buf = diff_delta(base_buf, base_size,
+			       buf, size, &delta_size, 0);
+
+	compressed_size = do_compress(&delta_buf, delta_size);
+
+	hdrlen = encode_in_pack_object_header(header, sizeof(header),
+					      OBJ_REF_DELTA, delta_size);
+	hashwrite(f, header, hdrlen);
+	hashwrite(f, base->hash, the_repository->hash_algo->rawsz);
+	hashwrite(f, delta_buf, compressed_size);
+
+	free(buf);
+	free(base_buf);
+	free(delta_buf);
+}
+
+int cmd__pack_deltas(int argc, const char **argv)
+{
+	int num_objects = -1;
+	struct hashfile *f;
+	struct strbuf line = STRBUF_INIT;
+	struct option options[] = {
+		OPT_INTEGER('n', "num-objects", &num_objects, N_("the number of objects to write")),
+		OPT_END()
+	};
+
+	argc = parse_options(argc, argv, NULL,
+			     options, usage_str, 0);
+
+	if (argc || num_objects < 0)
+		usage_with_options(usage_str, options);
+
+	setup_git_directory();
+
+	f = hashfd(the_repository->hash_algo, 1, "<stdout>");
+	write_pack_header(f, num_objects);
+
+	/* Read each line from stdin into 'line' */
+	while (strbuf_getline_lf(&line, stdin) != EOF) {
+		const char *type_str, *content_oid_str, *base_oid_str = NULL;
+		struct object_id content_oid, base_oid;
+		struct string_list items = STRING_LIST_INIT_NODUP;
+		/*
+		 * Tokenize into two or three parts:
+		 * 1. REF_DELTA, OFS_DELTA, or FULL.
+		 * 2. The object ID for the content object.
+		 * 3. The object ID for the base object (optional).
+		 */
+		if (string_list_split_in_place(&items, line.buf, " ", 3) < 0)
+			die("invalid input format: %s", line.buf);
+
+		if (items.nr < 2)
+			die("invalid input format: %s", line.buf);
+
+		type_str = items.items[0].string;
+		content_oid_str = items.items[1].string;
+
+		if (get_oid_hex(content_oid_str, &content_oid))
+			die("invalid object: %s", content_oid_str);
+		if (items.nr >= 3) {
+			base_oid_str = items.items[2].string;
+			if (get_oid_hex(base_oid_str, &base_oid))
+				die("invalid object: %s", base_oid_str);
+		}
+		string_list_clear(&items, 0);
+
+		if (!strcmp(type_str, "REF_DELTA"))
+			write_ref_delta(f, &content_oid, &base_oid);
+		else if (!strcmp(type_str, "OFS_DELTA"))
+			die("OFS_DELTA not implemented");
+		else if (!strcmp(type_str, "FULL"))
+			die("FULL not implemented");
+		else
+			die("unknown pack type: %s", type_str);
+	}
+
+	finalize_hashfile(f, NULL, FSYNC_COMPONENT_PACK,
+			  CSUM_HASH_IN_STREAM | CSUM_FSYNC | CSUM_CLOSE);
+	strbuf_release(&line);
+	return 0;
+}
diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c
index 50dc4dac4ed625..74812ed86d385a 100644
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@@ -46,6 +46,7 @@ static struct test_cmd cmds[] = {
 	{ "mktemp", cmd__mktemp },
 	{ "name-hash", cmd__name_hash },
 	{ "online-cpus", cmd__online_cpus },
+	{ "pack-deltas", cmd__pack_deltas },
 	{ "pack-mtimes", cmd__pack_mtimes },
 	{ "parse-options", cmd__parse_options },
 	{ "parse-options-flags", cmd__parse_options_flags },
diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h
index 6d62a5b53d9596..2571a3ccfe8991 100644
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@@ -39,6 +39,7 @@ int cmd__mergesort(int argc, const char **argv);
 int cmd__mktemp(int argc, const char **argv);
 int cmd__name_hash(int argc, const char **argv);
 int cmd__online_cpus(int argc, const char **argv);
+int cmd__pack_deltas(int argc, const char **argv);
 int cmd__pack_mtimes(int argc, const char **argv);
 int cmd__parse_options(int argc, const char **argv);
 int cmd__parse_options_flags(int argc, const char **argv);

From 53a990e69eaf9564e6bb7a38f9205a17283321de Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Wed, 23 Apr 2025 11:05:48 -0400
Subject: [PATCH 2/3] t5309: create failing test for 'git index-pack'

This new test demonstrates some behavior where a valid packfile is being
rejected by the Git client due to the order in which it is resolving
REF_DELTAs.

The thin packfile has a REF_DELTA chain A->B->C where C is not included
in the packfile. However, the client repository contains both C and B
already. Thus, 'git index-pack' is able to resolve A before resolving B.

When resolving B, it then attempts to resolve any other REF_DELTAs that
are pointing to B as a base. This "revisits" A and complains as if there
is a cycle, but it did not actually detect a cycle.

A fix will arrive in the next change.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 t/t5309-pack-delta-cycles.sh | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/t/t5309-pack-delta-cycles.sh b/t/t5309-pack-delta-cycles.sh
index 60fc710bacb20e..6a9367633026c1 100755
--- a/t/t5309-pack-delta-cycles.sh
+++ b/t/t5309-pack-delta-cycles.sh
@@ -75,4 +75,28 @@ test_expect_success 'failover to a duplicate object in the same pack' '
 	test_must_fail git index-pack --fix-thin --stdin <recoverable.pack
 '
 
+test_expect_failure 'index-pack works with thin pack A->B->C with B on disk' '
+	git init server &&
+	(
+		cd server &&
+		test_commit_bulk 4
+	) &&
+
+	A=$(git -C server rev-parse HEAD^{tree}) &&
+	B=$(git -C server rev-parse HEAD~1^{tree}) &&
+	C=$(git -C server rev-parse HEAD~2^{tree}) &&
+	git -C server reset --hard HEAD~1 &&
+
+	test-tool -C server pack-deltas --num-objects=2 >thin.pack <<-EOF &&
+	REF_DELTA $A $B
+	REF_DELTA $B $C
+	EOF
+
+	git clone "file://$(pwd)/server" client &&
+	(
+		cd client &&
+		git index-pack --fix-thin --stdin <../thin.pack
+	)
+'
+
 test_done

From 1358039b2f3bf893fffc63c1065f1d6862b74957 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Wed, 23 Apr 2025 11:36:10 -0400
Subject: [PATCH 3/3] index-pack: allow revisiting REF_DELTA chains

As detailed in the previous changes to t5309-pack-delta-cycles.sh, the
logic within 'git index-pack' to analyze an incoming thin packfile with
REF_DELTAs is suspect. The algorithm is overly cautious around delta
cycles, and that leads in fact to failing even when there is no cycle.

This change adjusts the algorithm to no longer fail in these cases. In
fact, these cycle cases will no longer fail but more importantly the
valid cases will no longer fail, either. The resulting packfile from the
--fix-thin operation will not have cycles either since REF_DELTAs are
forbidden from the on-disk format and OFS_DELTAs are impossible to write
as a cycle.

The crux of the matter is how the algorithm works when the REF_DELTAs
point to base objects that exist in the local repository. When reading
the thin packfile, the object IDs for the delta objects are unknown so
we do not have the delta chain structure automatically. Instead, we need
to start somewhere by selecting a delta whose base is inside our current
object database.

Consider the case where the packfile has two REF_DELTA objects, A and B,
and the delta chain looks like "A depends on B" and "B depends on C" for
some third object C, where C is already in the current repository. The
algorithm _should_ start with all objects that depend on C, finding B,
and then moving on to all objects depending on B, finding A.

However, if the repository also already has object B, then the delta
chain can be analyzed in a different order. The deltas with base B can
be analyzed first, finding A, and then the deltas with base C are
analyzed, finding B. The algorithm currently continues to look for
objects that depend on B, finding A again. This fails due to A's
'real_type' member already being overwritten from OBJ_REF_DELTA to the
correct object type.

This scenario is possible in a typical 'git fetch' where the client does
not advertise B as a 'have' but requests A as a 'want' (and C is noticed
as a common object based on other 'have's). The reason this isn't
typically seen is that most Git servers use OFS_DELTAs to represent
deltas within a packfile. However, if a server uses only REF_DELTAs,
then this kind of issue can occur. There is nothing in the explicit
packfile format that states this use of inter-pack REF_DELTA is
incorrect, only that REF_DELTAs should not be used in the on-disk
representation to avoid cycles.

This die() was introduced in ab791dd138 (index-pack: fix race condition
with duplicate bases, 2014-08-29). Several refactors have adjusted the
error message and the surrounding logic, but this issue has existed for
a longer time as that was only a conversion from an assert().

The tests in t5309 originated in 3b910d0c5e (add tests for indexing
packs with delta cycles, 2013-08-23) and b2ef3d9ebb (test index-pack on
packs with recoverable delta cycles, 2013-08-23). These changes make
note that the current behavior of handling "resolvable" cycles is mostly
a documentation-only test, not that this behavior is the best way for
Git to handle the situation.

The fix here is somewhat complicated due to the amount of state being
adjusted by the loop within threaded_second_pass(). Instead of trying to
resume the start of the loop while adjusting the necessary context, I
chose to scan the REF_DELTAs depending on the current 'parent' and skip
any that have already been processed. This necessarily leaves us in a
state where 'child' and 'child_obj' could be left as NULL and that must
be handled later. There is also some careful handling around skipping
REF_DELTAs when there are also OFS_DELTAs depending on that parent.
There may be value in extending 'test-tool pack-deltas' to allow writing
OFS_DELTAs in order to exercise this logic across the delta types.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/index-pack.c         | 58 ++++++++++++++++++++----------------
 t/t5309-pack-delta-cycles.sh | 12 ++++++--
 2 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index de127c0ff13a28..dbe79701fb8b6f 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -1109,8 +1109,8 @@ static void *threaded_second_pass(void *data)
 		set_thread_data(data);
 	for (;;) {
 		struct base_data *parent = NULL;
-		struct object_entry *child_obj;
-		struct base_data *child;
+		struct object_entry *child_obj = NULL;
+		struct base_data *child = NULL;
 
 		counter_lock();
 		display_progress(progress, nr_resolved_deltas);
@@ -1137,15 +1137,18 @@ static void *threaded_second_pass(void *data)
 			parent = list_first_entry(&work_head, struct base_data,
 						  list);
 
-			if (parent->ref_first <= parent->ref_last) {
+			while (parent->ref_first <= parent->ref_last) {
 				int offset = ref_deltas[parent->ref_first++].obj_no;
 				child_obj = objects + offset;
-				if (child_obj->real_type != OBJ_REF_DELTA)
-					die("REF_DELTA at offset %"PRIuMAX" already resolved (duplicate base %s?)",
-					    (uintmax_t) child_obj->idx.offset,
-					    oid_to_hex(&parent->obj->idx.oid));
+				if (child_obj->real_type != OBJ_REF_DELTA) {
+					child_obj = NULL;
+					continue;
+				}
 				child_obj->real_type = parent->obj->real_type;
-			} else {
+				break;
+			}
+
+			if (!child_obj && parent->ofs_first <= parent->ofs_last) {
 				child_obj = objects +
 					ofs_deltas[parent->ofs_first++].obj_no;
 				assert(child_obj->real_type == OBJ_OFS_DELTA);
@@ -1178,29 +1181,32 @@ static void *threaded_second_pass(void *data)
 		}
 		work_unlock();
 
-		if (parent) {
-			child = resolve_delta(child_obj, parent);
-			if (!child->children_remaining)
-				FREE_AND_NULL(child->data);
-		} else {
-			child = make_base(child_obj, NULL);
-			if (child->children_remaining) {
-				/*
-				 * Since this child has its own delta children,
-				 * we will need this data in the future.
-				 * Inflate now so that future iterations will
-				 * have access to this object's data while
-				 * outside the work mutex.
-				 */
-				child->data = get_data_from_pack(child_obj);
-				child->size = child_obj->size;
+		if (child_obj) {
+			if (parent) {
+				child = resolve_delta(child_obj, parent);
+				if (!child->children_remaining)
+					FREE_AND_NULL(child->data);
+			} else{
+				child = make_base(child_obj, NULL);
+				if (child->children_remaining) {
+					/*
+					 * Since this child has its own delta children,
+					 * we will need this data in the future.
+					 * Inflate now so that future iterations will
+					 * have access to this object's data while
+					 * outside the work mutex.
+					 */
+					child->data = get_data_from_pack(child_obj);
+					child->size = child_obj->size;
+				}
 			}
 		}
 
 		work_lock();
 		if (parent)
 			parent->retain_data--;
-		if (child->data) {
+
+		if (child && child->data) {
 			/*
 			 * This child has its own children, so add it to
 			 * work_head.
@@ -1209,7 +1215,7 @@ static void *threaded_second_pass(void *data)
 			base_cache_used += child->size;
 			prune_base_data(NULL);
 			free_base_data(child);
-		} else {
+		} else if (child) {
 			/*
 			 * This child does not have its own children. It may be
 			 * the last descendant of its ancestors; free those
diff --git a/t/t5309-pack-delta-cycles.sh b/t/t5309-pack-delta-cycles.sh
index 6a9367633026c1..6b03675d91b5e1 100755
--- a/t/t5309-pack-delta-cycles.sh
+++ b/t/t5309-pack-delta-cycles.sh
@@ -60,7 +60,10 @@ test_expect_success 'index-pack detects REF_DELTA cycles' '
 test_expect_success 'failover to an object in another pack' '
 	clear_packs &&
 	git index-pack --stdin <ab.pack &&
-	test_must_fail git index-pack --stdin --fix-thin <cycle.pack
+
+	# This cycle does not fail since the existence of A & B in
+	# the repo allows us to resolve the cycle.
+	git index-pack --stdin --fix-thin <cycle.pack
 '
 
 test_expect_success 'failover to a duplicate object in the same pack' '
@@ -72,10 +75,13 @@ test_expect_success 'failover to a duplicate object in the same pack' '
 		pack_obj $A
 	} >recoverable.pack &&
 	pack_trailer recoverable.pack &&
-	test_must_fail git index-pack --fix-thin --stdin <recoverable.pack
+
+	# This cycle does not fail since the existence of a full copy
+	# of A in the pack allows us to resolve the cycle.
+	git index-pack --fix-thin --stdin <recoverable.pack
 '
 
-test_expect_failure 'index-pack works with thin pack A->B->C with B on disk' '
+test_expect_success 'index-pack works with thin pack A->B->C with B on disk' '
 	git init server &&
 	(
 		cd server &&