From 9511eb28eae54a332aba9ec39552eec795ab4e04 Mon Sep 17 00:00:00 2001 From: hazemadelkhalel Date: Tue, 4 Feb 2025 14:37:52 +0000 Subject: [PATCH] feat: add incident response handbook docs --- .../engineering/incident-response.mdx | 60 +++++++++++++++++++ docs/mint.json | 5 +- 2 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 docs/handbook/engineering/incident-response.mdx diff --git a/docs/handbook/engineering/incident-response.mdx b/docs/handbook/engineering/incident-response.mdx new file mode 100644 index 0000000000..202ab5557f --- /dev/null +++ b/docs/handbook/engineering/incident-response.mdx @@ -0,0 +1,60 @@ +--- +title: "Incident Response" +icon: 'bell-ring' +--- + +Incident.io is our primary tool for managing and responding to urgent issues and service disruptions. This guide explains how we use incident.io to coordinate our on-call rotations and emergency response procedures. + +## Setup and Notifications + +### Personal Setup +1. Download the incident.io mobile app from your device's app store +2. Ask your team to add you to the incident.io workspace +3. Configure your notification preferences: + - Phone calls for critical incidents + - Push notifications for high-priority issues + - Slack notifications for standard updates + +### On-Call Rotations +Our team operates on a weekly rotation schedule through incident.io, where every team member participates. When you're on-call: +- You'll receive priority notifications for all urgent issues +- Phone calls will be placed for critical service disruptions +- Rotations change every week, with handoffs occurring on Monday mornings +- Response is expected within 15 minutes for critical incidents + + + + If you are unable to respond to an incident, please escalate to the engineering team. + + + +## Creating an Incident + +There are three ways to create an incident: + +### Channel 1: Fire Emoji Reaction through Slack + +1. When a customer reports an issue in Slack +2. 
React to the message with the 🔥 emoji +3. This will automatically: + - Create a new incident + - Page the on-call engineer + - Create a dedicated incident channel + - Link the original message for context + +### Channel 2: GitHub Issue + +1. Create a new issue in the GitHub repository +2. Add the `high priority` label +3. This will flag the issue for immediate attention from the on-call engineer + +### Channel 3: Outage Notification through Digital Ocean / Checkly + +1. Digital Ocean will automatically notify us when there is an outage in CPU, Memory or Disk +2. Checkly will automatically notify us when the e2e test fails and the website is down + + +## Best Practices +- Always acknowledge incident notifications +- Update the incident status regularly +- Document any actions taken in the incident channel \ No newline at end of file diff --git a/docs/mint.json b/docs/mint.json index 9c7ccc7215..6a55c10bb4 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -102,8 +102,9 @@ "icon": "code", "pages": [ "handbook/engineering/how-we-work", - "handbook/engineering/pre-releases", - "handbook/engineering/on-call" + "handbook/engineering/on-call", + "handbook/engineering/incident-response", + "handbook/engineering/pre-releases" ] }, {