Skip to content

Missing Connection Retry after Connection Loss #35

@severindellsperger

Description

@severindellsperger

We have an HA RabbitMQ cluster on which we are testing the HA functionality.
Our application is structured as follows:

  • Several consumers, which consume messages from a defined exchange (we have to use a fanout strategy because there are several identical distributed consumers):
    Consumer Implementation
    async def handle_message(self, message: bytes) -> None:
        """Decode one JSON message body and broadcast it to the "websocket" group.

        Args:
            message: Raw message body; it is parsed with ``json.loads``.
        """
        # Message processing happens here.
        # The sender is resolved before the payload is decoded, which is the
        # same order a single chained call expression evaluates them in.
        group_send = get_channel_layer().group_send
        event = {"type": "websocket.message", "message": json.loads(message)}
        await group_send("websocket", event)

    def _random_letters(self, n: int) -> str:
        """Return a string of ``n`` randomly chosen ASCII letters.

        Letters are drawn with replacement from ``string.ascii_letters`` using
        the non-cryptographic ``random`` module, so the result is meant for
        naming things, not for secrets.

        Args:
            n: Number of letters to generate; ``0`` yields an empty string.
        """
        # random.choices draws all k samples in one call instead of looping
        # over random.choice with an unused index variable.
        return "".join(random.choices(string.ascii_letters, k=n))

    async def process_messages(self):
        """Set up a private queue on the fanout exchange and consume from it.

        Declares ``self.exchange`` as a fanout exchange, declares a queue with
        a randomly suffixed name (stored on ``self.queue_name``), binds the
        queue to the exchange, and then hands every consumed message to
        ``handle_message``.
        """
        connection = await get_channel_layer().carehare_connection

        suffix = self._random_letters(12)
        self.queue_name = f"changes_{suffix}"
        queue_arguments = {"x-queue-type": "quorum", "x-expires": 5}

        # Order matters: exchange first, then the queue, then the binding.
        await connection.exchange_declare(
            exchange_name=self.exchange, exchange_type="fanout"
        )
        await connection.queue_declare(
            queue_name=self.queue_name, durable=True, arguments=queue_arguments
        )
        await connection.queue_bind(
            exchange_name=self.exchange, queue_name=self.queue_name
        )
        self.logger.info(f"Connected to queue {self.queue_name}: ")

        async with connection.acking_consumer(self.queue_name) as deliveries:
            async for body in deliveries:
                await self.handle_message(body)

    def handle(self, *args, **options):
        """Pick the exchange name and run the message-processing coroutine.

        The ``exchange`` option is used when it is present and truthy;
        otherwise the exchange name falls back to ``"test"``.
        """
        requested = options.get("exchange")
        self.exchange = requested if requested else "test"
        asyncio.run(self.process_messages())

  • A WebSocket consumer, which receives messages from the consumers:
  • Django Channels Websocket Consumer Implementation
class UpdateTopologyConsumer(JsonWebsocketConsumer):
    """WebSocket consumer that relays "websocket" group events to its client."""

    def connect(self):
        """Join the "websocket" group, then accept the connection."""
        async_to_sync(self.channel_layer.group_add)("websocket", self.channel_name)
        self.accept()

    def websocket_message(self, message):
        """Handle a ``websocket.message`` group event by forwarding it as JSON."""
        print(message, flush=True)
        self.send_json(message)

    def disconnect(self, close_code):
        """Leave the "websocket" group and close the socket.

        Args:
            close_code: Close code supplied by the framework; not used here.
        """
        # Without the discard, this channel stays registered in the group
        # after the client has gone away.
        async_to_sync(self.channel_layer.group_discard)(
            "websocket", self.channel_name
        )
        self.close()

Now we face the problem that if one RabbitMQ node in the cluster goes down, the application breaks:
Consumer Error

backend> python manage.py listenonupdates changes -v 3
2021-03-12 12:31:19,713 | INFO | Connect to RabbitMQ and subscribe to exchange: changes
2021-03-12 12:31:19,888 | INFO | Connected to queue changes_PHnjRResdlDD: 
2021-03-12 12:33:08,772 | INFO | Message received: 
2021-03-12 12:33:08,772 | INFO | b'{"bla": "bla bla"}'
Disconnected from RabbitMQ: RabbitMQ closed the connection: 320 CONNECTION_FORCED - Node was put into maintenance mode. Will reconnect.
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/channels_rabbitmq/core.py", line 263, in _reconnect_forever
    await connection.closed
carehare._exceptions.ConnectionClosedByServer: RabbitMQ closed the connection: 320 CONNECTION_FORCED - Node was put into maintenance mode
Closing consumer
Traceback (most recent call last):
  File "/usr/src/app/backend/updatetopology/management/commands/listenonupdates.py", line 68, in process_messages
    async for message in consumer:
  File "/usr/local/lib/python3.9/site-packages/carehare/_consume_channel.py", line 74, in __anext__
    message, self._yielded_delivery_tag = await _next_delivery(
  File "/usr/local/lib/python3.9/site-packages/carehare/_consume_channel.py", line 50, in _next_delivery
    closed.result()  # raise exception if there is one
  File "/usr/local/lib/python3.9/site-packages/channels_rabbitmq/reader.py", line 37, in consume_into_multi_queue_until_connection_close
    multi_queue.put_nowait(
  File "/usr/local/lib/python3.9/site-packages/carehare/_consume_channel.py", line 223, in __aexit__
    await self.closed
  File "/usr/local/lib/python3.9/site-packages/channels_rabbitmq/reader.py", line 32, in consume_into_multi_queue_until_connection_close
    body, delivery_tag = await consumer.next_delivery()
  File "/usr/local/lib/python3.9/site-packages/carehare/_consume_channel.py", line 196, in next_delivery
    return await _next_delivery(self._queue, self.closed)
  File "/usr/local/lib/python3.9/site-packages/carehare/_consume_channel.py", line 50, in _next_delivery
    closed.result()  # raise exception if there is one
carehare._exceptions.ConnectionClosed
Traceback (most recent call last):
  File "/usr/src/app/backend/manage.py", line 22, in <module>
    main()
  File "/usr/src/app/backend/manage.py", line 18, in main
    execute_from_command_line(sys.argv)
  File "/usr/local/lib/python3.9/site-packages/django/core/management/__init__.py", line 401, in execute_from_command_line
    utility.execute()
  File "/usr/local/lib/python3.9/site-packages/django/core/management/__init__.py", line 395, in execute
    self.fetch_command(subcommand).run_from_argv(self.argv)
  File "/usr/local/lib/python3.9/site-packages/django/core/management/base.py", line 330, in run_from_argv
    self.execute(*args, **cmd_options)
  File "/usr/local/lib/python3.9/site-packages/django/core/management/base.py", line 371, in execute
    output = self.handle(*args, **options)
  File "/usr/src/app/backend/updatetopology/management/commands/listenonupdates.py", line 77, in handle
    asyncio.run(self.process_messages())
  File "/usr/local/lib/python3.9/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/usr/local/lib/python3.9/asyncio/base_events.py", line 642, in run_until_complete
    return future.result()
  File "/usr/src/app/backend/updatetopology/management/commands/listenonupdates.py", line 69, in process_messages
    await self.handle_message(message)
  File "/usr/local/lib/python3.9/site-packages/carehare/_consume_channel.py", line 223, in __aexit__
    await self.closed
  File "/usr/src/app/backend/updatetopology/management/commands/listenonupdates.py", line 68, in process_messages
    async for message in consumer:
  File "/usr/local/lib/python3.9/site-packages/carehare/_consume_channel.py", line 74, in __anext__
    message, self._yielded_delivery_tag = await _next_delivery(
  File "/usr/local/lib/python3.9/site-packages/carehare/_consume_channel.py", line 50, in _next_delivery
    closed.result()  # raise exception if there is one
  File "/usr/local/lib/python3.9/site-packages/channels_rabbitmq/reader.py", line 37, in consume_into_multi_queue_until_connection_close
    multi_queue.put_nowait(
  File "/usr/local/lib/python3.9/site-packages/carehare/_consume_channel.py", line 223, in __aexit__
    await self.closed
  File "/usr/local/lib/python3.9/site-packages/channels_rabbitmq/reader.py", line 32, in consume_into_multi_queue_until_connection_close
    body, delivery_tag = await consumer.next_delivery()
  File "/usr/local/lib/python3.9/site-packages/carehare/_consume_channel.py", line 196, in next_delivery
    return await _next_delivery(self._queue, self.closed)
  File "/usr/local/lib/python3.9/site-packages/carehare/_consume_channel.py", line 50, in _next_delivery
    closed.result()  # raise exception if there is one
carehare._exceptions.ConnectionClosed

In the Django Channels consumer, we most likely have the same problem:
Django Channels Consumer error

Disconnected from RabbitMQ: RabbitMQ closed the connection: 320 CONNECTION_FORCED - Node was put into maintenance mode. Will reconnect.
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/channels_rabbitmq/core.py", line 263, in _reconnect_forever
    await connection.closed
carehare._exceptions.ConnectionClosedByServer: RabbitMQ closed the connection: 320 CONNECTION_FORCED - Node was put into maintenance mode

We would like to reconnect automatically after a connection loss; how would you solve this problem?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions