
Commit fe127cc

Read ChannelManager even if we have no-peer post-update actions
In 93b4479 we fixed an issue which could cause a `ChannelMonitorUpdate` to get marked as blocked on itself, leading to an eventual force-closure.

One potential side effect of that issue, however, is that any further `ChannelMonitorUpdate`s to the same channel while it is blocked will not have their post-update actions processed (as there is a pending blocked `ChannelMonitorUpdate` sitting in the channel). This can leave a dangling `MonitorUpdateCompletionAction` around even after the channel is closed.

In 0.1, because `ChannelMonitorUpdate`s to closed channels were finally fully tracked, we started enforcing that any post-update completion action we had on startup corresponded to a peer entry, while at the same time no longer creating peer entries just because we had serialized one in the data we were loading (only creating them if we had channel(s) or a `ChannelMonitor`). As a result, some `ChannelManager`s may no longer deserialize on 0.1: we might have a left-over dangling `MonitorUpdateCompletionAction` and will no longer always have a peer entry just because of it.

Here we fix this issue by specifically checking for dangling `MonitorUpdateCompletionAction::PaymentClaimed` entries and dropping them if there is no corresponding channel or peer state entry. We only tolerate dangling `PaymentClaimed` actions, rather than allowing any dangling action, because 93b4479 was only triggerable with MPP claims, so dangling `MonitorUpdateCompletionAction`s for forwarded payments should be exceedingly rare.

This also adds an upgrade test covering a slightly convoluted version of this scenario.

Trivial conflicts addressed in:
 * lightning/src/ln/channelmanager.rs
1 parent fb906a8 commit fe127cc
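For readers who want the gist before reading the diff, here is a minimal, self-contained sketch of the relaxed startup check. This is not LDK's actual code (the authoritative change is the lightning/src/ln/channelmanager.rs hunk at the bottom of this commit); `CompletionAction`, `DecodeError`, `filter_blocked_actions`, and the raw byte-array node IDs are simplified stand-ins for illustration only.

use std::collections::HashMap;

// Hypothetical stand-ins for LDK's internal types; see the real change in the
// `channelmanager.rs` hunk below.
#[derive(Debug)]
enum CompletionAction {
    // A payment-claim action left dangling by the pre-0.1 MPP-claim bug.
    PaymentClaimed { payment_hash: [u8; 32] },
    // Any other kind of post-update action.
    Other,
}

#[derive(Debug)]
struct DecodeError;

// Sketch of the relaxed check: actions whose peer state still exists are kept as-is;
// for a missing peer, dangling `PaymentClaimed` actions are silently dropped, while any
// other dangling action still fails deserialization.
fn filter_blocked_actions(
    blocked_actions: HashMap<[u8; 33], Vec<CompletionAction>>,
    peer_states: &HashMap<[u8; 33], ()>,
) -> Result<HashMap<[u8; 33], Vec<CompletionAction>>, DecodeError> {
    let mut kept = HashMap::new();
    for (node_id, actions) in blocked_actions {
        if peer_states.contains_key(&node_id) {
            // Peer state exists, so these actions can be processed normally on startup.
            kept.insert(node_id, actions);
            continue;
        }
        for action in &actions {
            match action {
                // Safe to drop: only `PaymentClaimed` actions could be left dangling this way.
                CompletionAction::PaymentClaimed { .. } => {},
                // Anything else dangling is unexpected and aborts the read, as before.
                _ => return Err(DecodeError),
            }
        }
        // All actions for this absent peer were `PaymentClaimed`; they are discarded.
    }
    Ok(kept)
}

The key design choice, per the commit message, is to only tolerate dangling `PaymentClaimed` actions, since the underlying bug was only triggerable by MPP claims.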

File tree (3 files changed: +183 −4 lines)

lightning-tests/Cargo.toml
lightning-tests/src/upgrade_downgrade_tests.rs
lightning/src/ln/channelmanager.rs

lightning-tests/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ lightning-invoice = { path = "../lightning-invoice", default-features = false }
 lightning-macros = { path = "../lightning-macros" }
 lightning = { path = "../lightning", features = ["_test_utils"] }
 lightning_0_1 = { package = "lightning", version = "0.1.1", features = ["_test_utils"] }
+lightning_0_0_125 = { package = "lightning", version = "0.0.125", features = ["_test_utils"] }
 
 bitcoin = { version = "0.32.2", default-features = false }
 
lightning-tests/src/upgrade_downgrade_tests.rs

Lines changed: 151 additions & 1 deletion
@@ -12,7 +12,21 @@
 
 use lightning_0_1::get_monitor as get_monitor_0_1;
 use lightning_0_1::ln::functional_test_utils as lightning_0_1_utils;
-use lightning_0_1::util::ser::Writeable;
+use lightning_0_1::util::ser::Writeable as _;
+
+use lightning_0_0_125::chain::ChannelMonitorUpdateStatus as ChannelMonitorUpdateStatus_0_0_125;
+use lightning_0_0_125::check_added_monitors as check_added_monitors_0_0_125;
+use lightning_0_0_125::events::ClosureReason as ClosureReason_0_0_125;
+use lightning_0_0_125::expect_payment_claimed as expect_payment_claimed_0_0_125;
+use lightning_0_0_125::get_htlc_update_msgs as get_htlc_update_msgs_0_0_125;
+use lightning_0_0_125::get_monitor as get_monitor_0_0_125;
+use lightning_0_0_125::get_revoke_commit_msgs as get_revoke_commit_msgs_0_0_125;
+use lightning_0_0_125::ln::channelmanager::PaymentId as PaymentId_0_0_125;
+use lightning_0_0_125::ln::channelmanager::RecipientOnionFields as RecipientOnionFields_0_0_125;
+use lightning_0_0_125::ln::functional_test_utils as lightning_0_0_125_utils;
+use lightning_0_0_125::ln::msgs::ChannelMessageHandler as _;
+use lightning_0_0_125::routing::router as router_0_0_125;
+use lightning_0_0_125::util::ser::Writeable as _;
 
 use lightning::ln::functional_test_utils::*;
 
@@ -63,3 +77,139 @@ fn simple_upgrade() {
 
     claim_payment(&nodes[0], &[&nodes[1]], preimage);
 }
+
+#[test]
+fn test_125_dangling_post_update_actions() {
+    // Tests a failure of upgrading from 0.0.125 to 0.1 when there's a dangling
+    // `MonitorUpdateCompletionAction` due to the bug fixed in
+    // 93b4479e472e6767af5df90fecdcdfb79074e260.
+    let (node_d_ser, mon_ser);
+    {
+        // First, we get RAA-source monitor updates held by using async persistence (note that this
+        // issue was first identified as a consequence of the bug fixed in
+        // 93b4479e472e6767af5df90fecdcdfb79074e260 but in order to replicate that bug we need a
+        // complicated multi-threaded race that is not deterministic, thus we "cheat" here by using
+        // async persistence). We do this by simply claiming an MPP payment and not completing the
+        // second channel's `ChannelMonitorUpdate`, blocking RAA `ChannelMonitorUpdate`s from the
+        // first (which is ultimately a very similar bug to the one fixed in 93b4479e472e6767af5df).
+        //
+        // Then, we claim a second payment on the channel, which ultimately doesn't have its
+        // `ChannelMonitorUpdate` completion handled due to the presence of the blocked
+        // `ChannelMonitorUpdate`. The claim also generates a post-update completion action, but
+        // the `ChannelMonitorUpdate` isn't queued due to the RAA-update block.
+        let chanmon_cfgs = lightning_0_0_125_utils::create_chanmon_cfgs(4);
+        let node_cfgs = lightning_0_0_125_utils::create_node_cfgs(4, &chanmon_cfgs);
+        let node_chanmgrs =
+            lightning_0_0_125_utils::create_node_chanmgrs(4, &node_cfgs, &[None, None, None, None]);
+        let nodes = lightning_0_0_125_utils::create_network(4, &node_cfgs, &node_chanmgrs);
+
+        let node_b_id = nodes[1].node.get_our_node_id();
+        let node_d_id = nodes[3].node.get_our_node_id();
+
+        lightning_0_0_125_utils::create_announced_chan_between_nodes_with_value(
+            &nodes, 0, 1, 100_000, 0,
+        );
+        lightning_0_0_125_utils::create_announced_chan_between_nodes_with_value(
+            &nodes, 0, 2, 100_000, 0,
+        );
+        let chan_id_1_3 = lightning_0_0_125_utils::create_announced_chan_between_nodes_with_value(
+            &nodes, 1, 3, 100_000, 0,
+        )
+        .2;
+        let chan_id_2_3 = lightning_0_0_125_utils::create_announced_chan_between_nodes_with_value(
+            &nodes, 2, 3, 100_000, 0,
+        )
+        .2;
+
+        let (preimage, hash, secret) =
+            lightning_0_0_125_utils::get_payment_preimage_hash(&nodes[3], Some(15_000_000), None);
+
+        let pay_params = router_0_0_125::PaymentParameters::from_node_id(
+            node_d_id,
+            lightning_0_0_125_utils::TEST_FINAL_CLTV,
+        )
+        .with_bolt11_features(nodes[3].node.bolt11_invoice_features())
+        .unwrap();
+
+        let route_params =
+            router_0_0_125::RouteParameters::from_payment_params_and_value(pay_params, 15_000_000);
+        let route = lightning_0_0_125_utils::get_route(&nodes[0], &route_params).unwrap();
+
+        let onion = RecipientOnionFields_0_0_125::secret_only(secret);
+        let id = PaymentId_0_0_125(hash.0);
+        nodes[0].node.send_payment_with_route(route, hash, onion, id).unwrap();
+
+        check_added_monitors_0_0_125!(nodes[0], 2);
+        let paths = &[&[&nodes[1], &nodes[3]][..], &[&nodes[2], &nodes[3]]];
+        lightning_0_0_125_utils::pass_along_route(&nodes[0], paths, 15_000_000, hash, secret);
+
+        let preimage_2 = lightning_0_0_125_utils::route_payment(&nodes[1], &[&nodes[3]], 100_000).0;
+
+        chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus_0_0_125::InProgress);
+        chanmon_cfgs[3].persister.set_update_ret(ChannelMonitorUpdateStatus_0_0_125::InProgress);
+        nodes[3].node.claim_funds(preimage);
+        check_added_monitors_0_0_125!(nodes[3], 2);
+
+        let (outpoint, update_id, _) = {
+            let latest_monitors = nodes[3].chain_monitor.latest_monitor_update_id.lock().unwrap();
+            latest_monitors.get(&chan_id_1_3).unwrap().clone()
+        };
+        nodes[3].chain_monitor.chain_monitor.channel_monitor_updated(outpoint, update_id).unwrap();
+        expect_payment_claimed_0_0_125!(nodes[3], hash, 15_000_000);
+
+        let ds_fulfill = get_htlc_update_msgs_0_0_125!(nodes[3], node_b_id);
+        // Due to an unrelated test bug in 0.0.125, we have to leave the `ChannelMonitorUpdate` for
+        // the previous node un-completed or we will panic when dropping the `Node`.
+        chanmon_cfgs[1].persister.set_update_ret(ChannelMonitorUpdateStatus_0_0_125::InProgress);
+        nodes[1].node.handle_update_fulfill_htlc(&node_d_id, &ds_fulfill.update_fulfill_htlcs[0]);
+        check_added_monitors_0_0_125!(nodes[1], 1);
+
+        nodes[1].node.handle_commitment_signed(&node_d_id, &ds_fulfill.commitment_signed);
+        check_added_monitors_0_0_125!(nodes[1], 1);
+
+        // The `ChannelMonitorUpdate` generated by the RAA from node B to node D will be blocked.
+        let (bs_raa, _) = get_revoke_commit_msgs_0_0_125!(nodes[1], node_d_id);
+        nodes[3].node.handle_revoke_and_ack(&node_b_id, &bs_raa);
+        check_added_monitors_0_0_125!(nodes[3], 0);
+
+        // Now that there is a blocked update in the B <-> D channel, we can claim the second
+        // payment across it, which, while it will generate a `ChannelMonitorUpdate`, will not
+        // complete its post-update actions.
+        nodes[3].node.claim_funds(preimage_2);
+        check_added_monitors_0_0_125!(nodes[3], 1);
+
+        // Finally, we set up the failure by force-closing the channel in question, ensuring that
+        // 0.1 will not create a per-peer state for node B.
+        let err = "Force Closing Channel".to_owned();
+        nodes[3].node.force_close_without_broadcasting_txn(&chan_id_1_3, &node_b_id, err).unwrap();
+        let reason =
+            ClosureReason_0_0_125::HolderForceClosed { broadcasted_latest_txn: Some(false) };
+        let peers = &[node_b_id];
+        lightning_0_0_125_utils::check_closed_event(&nodes[3], 1, reason, false, peers, 100_000);
+        lightning_0_0_125_utils::check_closed_broadcast(&nodes[3], 1, true);
+        check_added_monitors_0_0_125!(nodes[3], 1);
+
+        node_d_ser = nodes[3].node.encode();
+        mon_ser = get_monitor_0_0_125!(nodes[3], chan_id_2_3).encode();
+    }
+
+    // Create a dummy node to reload over with the 0.0.125 state
+
+    let mut chanmon_cfgs = create_chanmon_cfgs(4);
+
+    // Our TestChannelSigner will fail as we're jumping ahead, so disable its state-based checks
+    chanmon_cfgs[0].keys_manager.disable_all_state_policy_checks = true;
+    chanmon_cfgs[1].keys_manager.disable_all_state_policy_checks = true;
+    chanmon_cfgs[2].keys_manager.disable_all_state_policy_checks = true;
+    chanmon_cfgs[3].keys_manager.disable_all_state_policy_checks = true;
+
+    let node_cfgs = create_node_cfgs(4, &chanmon_cfgs);
+    let (persister, chain_mon);
+    let node_chanmgrs = create_node_chanmgrs(4, &node_cfgs, &[None, None, None, None]);
+    let node;
+    let mut nodes = create_network(4, &node_cfgs, &node_chanmgrs);
+
+    // Finally, reload the node in the latest LDK. This previously failed.
+    let config = test_default_channel_config();
+    reload_node!(nodes[3], config, &node_d_ser, &[&mon_ser], persister, chain_mon, node);
+}

lightning/src/ln/channelmanager.rs

Lines changed: 31 additions & 3 deletions
@@ -13582,9 +13582,17 @@ where
                 $monitor: expr, $peer_state: expr, $logger: expr, $channel_info_log: expr
             ) => { {
                 let mut max_in_flight_update_id = 0;
+                let starting_len = $chan_in_flight_upds.len();
                 $chan_in_flight_upds.retain(|upd| upd.update_id > $monitor.get_latest_update_id());
+                if $chan_in_flight_upds.len() < starting_len {
+                    log_debug!(
+                        $logger,
+                        "{} ChannelMonitorUpdates completed after ChannelManager was last serialized",
+                        starting_len - $chan_in_flight_upds.len()
+                    );
+                }
                 for update in $chan_in_flight_upds.iter() {
-                    log_trace!($logger, "Replaying ChannelMonitorUpdate {} for {}channel {}",
+                    log_debug!($logger, "Replaying ChannelMonitorUpdate {} for {}channel {}",
                         update.update_id, $channel_info_log, &$monitor.channel_id());
                     max_in_flight_update_id = cmp::max(max_in_flight_update_id, update.update_id);
                     pending_background_events.push(
@@ -14148,11 +14156,31 @@ where
                             debug_assert!(false, "Non-event-generating channel freeing should not appear in our queue");
                         }
                     }
+                    // Note that we may have a post-update action for a channel that has no pending
+                    // `ChannelMonitorUpdate`s, but unlike the no-peer-state case, it may simply be
+                    // because we had a `ChannelMonitorUpdate` complete after the last time this
+                    // `ChannelManager` was serialized. In that case, we'll run the post-update
+                    // actions as soon as we get going.
                 }
                 peer_state.lock().unwrap().monitor_update_blocked_actions = monitor_update_blocked_actions;
             } else {
-                log_error!(WithContext::from(&args.logger, Some(node_id), None, None), "Got blocked actions without a per-peer-state for {}", node_id);
-                return Err(DecodeError::InvalidValue);
+                for actions in monitor_update_blocked_actions.values() {
+                    for action in actions.iter() {
+                        if matches!(action, MonitorUpdateCompletionAction::PaymentClaimed { .. }) {
+                            // If there are no state for this channel but we have pending
+                            // post-update actions, its possible that one was left over from pre-0.1
+                            // payment claims where MPP claims led to a channel blocked on itself
+                            // and later `ChannelMonitorUpdate`s didn't get their post-update
+                            // actions run.
+                            // This should only have happened for `PaymentClaimed` post-update actions,
+                            // which we ignore here.
+                        } else {
+                            let logger = WithContext::from(&args.logger, Some(node_id), None, None);
+                            log_error!(logger, "Got blocked actions {:?} without a per-peer-state for {}", monitor_update_blocked_actions, node_id);
+                            return Err(DecodeError::InvalidValue);
+                        }
+                    }
+                }
             }
         }
 
