From 8949dcc07abe98adebcb44746345a4d5e69b8f51 Mon Sep 17 00:00:00 2001
From: Suraj Nath <9503187+electron0zero@users.noreply.github.com>
Date: Sun, 3 Aug 2025 21:33:41 +0530
Subject: [PATCH] revamp: rework stories into posts
---
_config.yml | 10 +-
_includes/header.html | 3 +-
_posts/2017-01-31-gitlab-database-incident.md | 24 +++
_posts/2017-04-15-engineering-war-stories.md | 16 ++
_posts/2018-10-21-github-outage.md | 21 +++
_posts/2019-06-03-google-cloud-disruption.md | 21 +++
_posts/2019-07-02-cloudflare-regex-outage.md | 24 +++
.../2019-11-18-launchpad-upstream-outage.md | 16 ++
_posts/2020-01-23-grafana-labs-gcp-outage.md | 21 +++
_posts/2020-02-12-destiny-2-outage.md | 21 +++
.../2020-02-28-netflix-container-incident.md | 21 +++
_posts/2020-04-09-github-april-disruptions.md | 21 +++
_posts/2020-05-03-algolia-salt-incident.md | 21 +++
_posts/2020-05-12-slack-may-12-outage.md | 21 +++
_posts/2020-05-30-algolia-ssl-incident.md | 21 +++
.../2020-07-15-twitter-security-incident.md | 21 +++
...2020-07-19-debugging-distributed-system.md | 16 ++
_posts/2020-11-25-aws-kinesis-outage.md | 24 +++
_posts/2020-12-14-google-oauth-outage.md | 21 +++
_posts/2021-01-04-slack-january-4-outage.md | 21 +++
...11-salesforce-multi-instance-disruption.md | 21 +++
_posts/2021-06-08-fastly-global-outage.md | 23 +++
_posts/2021-10-04-facebook-global-outage.md | 27 ++++
_posts/2021-10-28-roblox-outage.md | 23 +++
_posts/2022-02-16-authzed-outage.md | 21 +++
...22-04-04-atlassian-multi-product-outage.md | 25 +++
_posts/2022-05-30-deno-outage.md | 21 +++
...2022-07-08-rogers-communications-outage.md | 21 +++
...22-07-19-uk-heatwave-datacenter-cooling.md | 24 +++
...2022-08-09-doordash-kubernetes-incident.md | 21 +++
_posts/2023-03-08-datadog-outage.md | 21 +++
index.html | 41 +----
pages/meetup.md | 2 +-
pages/stories.md | 149 -----------------
stories/index.html | 153 ++++++++++++++++++
35 files changed, 782 insertions(+), 196 deletions(-)
create mode 100644 _posts/2017-01-31-gitlab-database-incident.md
create mode 100644 _posts/2017-04-15-engineering-war-stories.md
create mode 100644 _posts/2018-10-21-github-outage.md
create mode 100644 _posts/2019-06-03-google-cloud-disruption.md
create mode 100644 _posts/2019-07-02-cloudflare-regex-outage.md
create mode 100644 _posts/2019-11-18-launchpad-upstream-outage.md
create mode 100644 _posts/2020-01-23-grafana-labs-gcp-outage.md
create mode 100644 _posts/2020-02-12-destiny-2-outage.md
create mode 100644 _posts/2020-02-28-netflix-container-incident.md
create mode 100644 _posts/2020-04-09-github-april-disruptions.md
create mode 100644 _posts/2020-05-03-algolia-salt-incident.md
create mode 100644 _posts/2020-05-12-slack-may-12-outage.md
create mode 100644 _posts/2020-05-30-algolia-ssl-incident.md
create mode 100644 _posts/2020-07-15-twitter-security-incident.md
create mode 100644 _posts/2020-07-19-debugging-distributed-system.md
create mode 100644 _posts/2020-11-25-aws-kinesis-outage.md
create mode 100644 _posts/2020-12-14-google-oauth-outage.md
create mode 100644 _posts/2021-01-04-slack-january-4-outage.md
create mode 100644 _posts/2021-05-11-salesforce-multi-instance-disruption.md
create mode 100644 _posts/2021-06-08-fastly-global-outage.md
create mode 100644 _posts/2021-10-04-facebook-global-outage.md
create mode 100644 _posts/2021-10-28-roblox-outage.md
create mode 100644 _posts/2022-02-16-authzed-outage.md
create mode 100644 _posts/2022-04-04-atlassian-multi-product-outage.md
create mode 100644 _posts/2022-05-30-deno-outage.md
create mode 100644 _posts/2022-07-08-rogers-communications-outage.md
create mode 100644 _posts/2022-07-19-uk-heatwave-datacenter-cooling.md
create mode 100644 _posts/2022-08-09-doordash-kubernetes-incident.md
create mode 100644 _posts/2023-03-08-datadog-outage.md
delete mode 100644 pages/stories.md
create mode 100644 stories/index.html
diff --git a/_config.yml b/_config.yml
index ac7104a..cb59e48 100644
--- a/_config.yml
+++ b/_config.yml
@@ -34,7 +34,6 @@ show_excerpts: true
# pages linked on top header
# NOTE: items are rendered in the order here
header_pages:
- - pages/stories.md
- pages/learn.md
- pages/meetup.md
- pages/meetup-pune.md
@@ -47,14 +46,13 @@ kramdown:
input: GFM
hard_wrap: false
-# posts to /incidents/ url parmalink
-# FIXME: find a better
-permalink: /:year-:month-:day/:title/
+# permalink pattern: every post is served under /stories/:title/
+permalink: /stories/:title/
# pagination config
-paginate: 5
-paginate_path: "/page:num/"
+paginate: 10
+paginate_path: "/stories/page:num/"
excerpt_separator:
# Build settings
diff --git a/_includes/header.html b/_includes/header.html
index 0976924..cb01f63 100644
--- a/_includes/header.html
+++ b/_includes/header.html
@@ -17,9 +17,10 @@
+
Stories
{%- for path in page_paths -%}
{%- assign my_page = site.pages | where: "path", path | first -%}
- {%- if my_page.title -%}
+ {%- if my_page.title and my_page.title != 'Stories' -%}
{{ my_page.title | escape }}
{%- endif -%}
{%- endfor -%}
diff --git a/_posts/2017-01-31-gitlab-database-incident.md b/_posts/2017-01-31-gitlab-database-incident.md
new file mode 100644
index 0000000..68f5e96
--- /dev/null
+++ b/_posts/2017-01-31-gitlab-database-incident.md
@@ -0,0 +1,24 @@
+---
+layout: post
+title: "GitLab Database Incident"
+date: 2017-01-31
+categories: [outage, database, human-error]
+tags: [gitlab, database, postgresql, backup, human-error]
+company: GitLab
+incident_date: 2017-01-31
+duration: "18+ hours"
+affected_services: ["GitLab.com", "GitLab CI", "GitLab Pages"]
+---
+
+On January 31, 2017, a GitLab engineer accidentally deleted the production database directory while troubleshooting performance issues, running `rm -rf` on the wrong server. The incident revealed that all five backup systems had been failing silently, resulting in 18 hours of downtime and the loss of 6 hours of data.
+
+GitLab's transparent communication during the crisis - including live blogs, real-time updates, and a publicly shared recovery document - turned a disaster into a case study in crisis management and operational transparency.
+
+
+
+
+## Sources
+
+- [GitLab.com database incident - January 31st 2017](https://about.gitlab.com/blog/2017/02/01/gitlab-dot-com-database-incident/)
+- [Postmortem of database outage of January 31](https://about.gitlab.com/blog/2017/02/10/postmortem-of-database-outage-of-january-31/)
+- [Google Doc - Live incident response](https://docs.google.com/document/d/1GCK53YDcBWQveod9kfzW-VCxIABGiryG7_z_6jHdVik/pub) (Historical)
\ No newline at end of file
diff --git a/_posts/2017-04-15-engineering-war-stories.md b/_posts/2017-04-15-engineering-war-stories.md
new file mode 100644
index 0000000..02dd303
--- /dev/null
+++ b/_posts/2017-04-15-engineering-war-stories.md
@@ -0,0 +1,16 @@
+---
+layout: post
+title: "Engineering War Stories Collection"
+date: 2017-04-15
+company: "HackerNews Community"
+duration: "N/A"
+tags: [hackernews, community, engineering, war-stories, collection]
+---
+
+A HackerNews thread collecting engineering war stories from the community. Software engineers share their most memorable production incidents, debugging adventures, and lessons learned from system failures.
+
+The thread contains dozens of real-world experiences from engineers across different companies and industries, providing insights into common failure patterns and recovery strategies.
+
+## Sources
+
+- [Ask HN: Tell me an engineering war story from your career](https://news.ycombinator.com/item?id=13987098)
\ No newline at end of file
diff --git a/_posts/2018-10-21-github-outage.md b/_posts/2018-10-21-github-outage.md
new file mode 100644
index 0000000..77560b3
--- /dev/null
+++ b/_posts/2018-10-21-github-outage.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "GitHub Outage"
+date: 2018-10-21
+categories: [outage, database, infrastructure]
+tags: [github, database, mysql, infrastructure, git]
+company: GitHub
+incident_date: 2018-10-21
+duration: "24+ hours"
+affected_services: ["GitHub.com", "Git Operations", "Issues", "Pull Requests"]
+---
+
+On October 21, 2018, GitHub experienced a major outage lasting over 24 hours due to network connectivity issues between their East and West Coast data centers. The incident resulted in database inconsistencies and required extensive data reconciliation to restore service.
+
+The outage affected millions of developers worldwide and highlighted the challenges of maintaining consistency in distributed systems during network partitions, leading to significant improvements in GitHub's infrastructure resilience.
+
+
+
+## Sources
+
+- [October 21 post-incident analysis - GitHub Blog](https://github.blog/2018-10-30-oct21-post-incident-analysis/)
\ No newline at end of file
diff --git a/_posts/2019-06-03-google-cloud-disruption.md b/_posts/2019-06-03-google-cloud-disruption.md
new file mode 100644
index 0000000..699e1cd
--- /dev/null
+++ b/_posts/2019-06-03-google-cloud-disruption.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Google Cloud June 3 2019 Service Disruption"
+date: 2019-06-03
+categories: [outage, cloud, networking]
+tags: [google-cloud, networking, infrastructure]
+company: Google Cloud
+incident_date: 2019-06-03
+duration: "Several hours"
+affected_services: ["Google Cloud Platform", "G Suite", "YouTube"]
+---
+
+On June 3, 2019, Google Cloud experienced a widespread service disruption affecting Google Cloud Platform, G Suite, and YouTube. The incident was caused by network congestion in the eastern United States that impacted Google's global infrastructure.
+
+The outage demonstrated how network-level issues can cascade through interconnected cloud services, affecting not just enterprise customers but also consumer services used by millions worldwide.
+
+
+
+## Sources
+
+- [June 3 2019 disruption](https://cloud.google.com/blog/topics/inside-google-cloud/an-update-on-sundays-service-disruption)
\ No newline at end of file
diff --git a/_posts/2019-07-02-cloudflare-regex-outage.md b/_posts/2019-07-02-cloudflare-regex-outage.md
new file mode 100644
index 0000000..ed565b4
--- /dev/null
+++ b/_posts/2019-07-02-cloudflare-regex-outage.md
@@ -0,0 +1,24 @@
+---
+layout: post
+title: "Details of the Cloudflare outage on July 2, 2019"
+date: 2019-07-02
+categories: [outage, performance, regex]
+tags: [cloudflare, regex, cpu, performance, waf]
+company: Cloudflare
+incident_date: 2019-07-02
+duration: "27 minutes"
+affected_services: ["Cloudflare CDN", "Web Application Firewall"]
+---
+
+On July 2, 2019, Cloudflare deployed a Web Application Firewall rule containing a regex with catastrophic backtracking behavior. The pattern consumed excessive CPU resources, causing global server unresponsiveness and widespread internet disruption for 27 minutes.
+
+The incident demonstrated how a single line of code can have massive global impact. Recovery was achieved by disabling the problematic WAF rule, leading to improvements in regex testing, staged rollouts, and automated circuit breakers for resource-intensive operations.
+
+
+
+
+## Sources
+
+- [Details of the Cloudflare outage on July 2, 2019](https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/)
+- [Regex Performance Analysis Tools](https://regex101.com/) - for testing regex patterns
+- [Regular Expression Denial of Service (ReDoS)](https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS)
\ No newline at end of file
diff --git a/_posts/2019-11-18-launchpad-upstream-outage.md b/_posts/2019-11-18-launchpad-upstream-outage.md
new file mode 100644
index 0000000..555db20
--- /dev/null
+++ b/_posts/2019-11-18-launchpad-upstream-outage.md
@@ -0,0 +1,16 @@
+---
+layout: post
+title: "Launchpad Upstream Outage Chain Reaction"
+date: 2019-11-18
+company: "Ubuntu/Canonical"
+duration: "Several hours"
+tags: [launchpad, ubuntu, upstream, chain-reaction, dependencies]
+---
+
+An upstream outage at Launchpad caused a cascading chain of issues across multiple Ubuntu and open-source project infrastructure systems. This incident demonstrates how upstream service failures can trigger widespread downstream effects.
+
+The outage highlighted the interconnected nature of modern software development infrastructure and the importance of understanding dependency chains when building resilient systems.
+
+## Sources
+
+- [Impact of an upstream outage - chain of issues due to launchpad outage](https://github.com/electron0zero/failure-modes/issues/58)
\ No newline at end of file
diff --git a/_posts/2020-01-23-grafana-labs-gcp-outage.md b/_posts/2020-01-23-grafana-labs-gcp-outage.md
new file mode 100644
index 0000000..1969ecc
--- /dev/null
+++ b/_posts/2020-01-23-grafana-labs-gcp-outage.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Grafana Labs GCP Outage"
+date: 2020-01-23
+categories: [outage, cloud, infrastructure]
+tags: [grafana, gcp, persistent-disk, cloud, monitoring]
+company: Grafana Labs
+incident_date: 2020-01-23
+duration: "23 hours"
+affected_services: ["Grafana Cloud", "Monitoring Services"]
+---
+
+On January 23, 2020, Grafana Labs experienced a 23-hour outage caused by a Google Cloud Platform persistent disk incident. What started as a GCP infrastructure issue snowballed into an extended outage due to cascading failures and recovery complications.
+
+The incident demonstrated how cloud provider issues can compound with application-level problems, teaching important lessons about cloud dependency management and disaster recovery planning.
+
+
+
+## Sources
+
+- [How a GCP Persistent Disk incident snowballed into a 23-hour outage - Grafana Labs](https://grafana.com/blog/2020/01/23/how-a-gcp-persistent-disk-incident-snowballed-into-a-23-hour-outage-and-taught-us-some-important-lessons/)
\ No newline at end of file
diff --git a/_posts/2020-02-12-destiny-2-outage.md b/_posts/2020-02-12-destiny-2-outage.md
new file mode 100644
index 0000000..384409c
--- /dev/null
+++ b/_posts/2020-02-12-destiny-2-outage.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Destiny 2 Outage"
+date: 2020-02-12
+categories: [outage, gaming, infrastructure]
+tags: [bungie, destiny-2, gaming, infrastructure]
+company: Bungie
+incident_date: 2020-02-12
+duration: "Several hours"
+affected_services: ["Destiny 2", "Game Servers", "Matchmaking"]
+---
+
+On February 12, 2020, Bungie's Destiny 2 experienced a significant outage affecting game servers and matchmaking services. The incident prevented players from accessing the online multiplayer game and highlighted the infrastructure challenges of maintaining always-online gaming services.
+
+Gaming platforms face unique scalability and reliability challenges, as players expect near-zero downtime for their entertainment and social experiences.
+
+
+
+## Sources
+
+- [Destiny 2 Outage Feb 12, 2020](https://www.bungie.net/en/Explore/Detail/News/48723)
\ No newline at end of file
diff --git a/_posts/2020-02-28-netflix-container-incident.md b/_posts/2020-02-28-netflix-container-incident.md
new file mode 100644
index 0000000..efd8187
--- /dev/null
+++ b/_posts/2020-02-28-netflix-container-incident.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Netflix Container Infrastructure Incident"
+date: 2020-02-28
+categories: [outage, containers, infrastructure]
+tags: [netflix, containers, kubernetes, infrastructure]
+company: Netflix
+incident_date: 2020-02-28
+duration: "Several hours"
+affected_services: ["Netflix Streaming", "Container Infrastructure"]
+---
+
+Netflix experienced an incident where containers were taking out nodes in their infrastructure, affecting streaming services. The issue highlighted the challenges of container orchestration at Netflix's massive scale and the potential for container-level issues to impact entire nodes.
+
+The incident demonstrated the complexities of managing containerized workloads in production environments and the importance of proper resource isolation and monitoring.
+
+
+
+## Sources
+
+- [Containers taking out nodes](https://twitter.com/sargun/status/1228495222658613250?s=19)
\ No newline at end of file
diff --git a/_posts/2020-04-09-github-april-disruptions.md b/_posts/2020-04-09-github-april-disruptions.md
new file mode 100644
index 0000000..4e147a0
--- /dev/null
+++ b/_posts/2020-04-09-github-april-disruptions.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "GitHub April 2020 Disruptions"
+date: 2020-04-09
+categories: [outage, infrastructure, git]
+tags: [github, infrastructure, git, distributed-systems]
+company: GitHub
+incident_date: 2020-04-09
+duration: "Several hours"
+affected_services: ["GitHub.com", "Git Operations", "Actions", "Pages"]
+---
+
+In April 2020, GitHub experienced multiple service disruptions affecting various platform components including Git operations, GitHub Actions, and Pages. The incidents occurred during increased usage due to remote work adoption during the COVID-19 pandemic.
+
+The disruptions highlighted the challenges of maintaining service reliability during unprecedented traffic growth and the critical role of version control platforms in remote development workflows.
+
+
+
+## Sources
+
+- [April 2020 disruptions](https://github.blog/2020-05-22-april-service-disruptions-analysis/)
\ No newline at end of file
diff --git a/_posts/2020-05-03-algolia-salt-incident.md b/_posts/2020-05-03-algolia-salt-incident.md
new file mode 100644
index 0000000..6eb9ed5
--- /dev/null
+++ b/_posts/2020-05-03-algolia-salt-incident.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Salt Incident: May 3rd, 2020 Retrospective and Update"
+date: 2020-05-03
+categories: [outage, infrastructure, security]
+tags: [algolia, salt, configuration-management, security]
+company: Algolia
+incident_date: 2020-05-03
+duration: "Several hours"
+affected_services: ["Algolia Search", "Configuration Management"]
+---
+
+On May 3, 2020, Algolia experienced a service disruption related to their Salt configuration management system. The incident affected search services and highlighted vulnerabilities in configuration management infrastructure.
+
+The outage demonstrated the critical role of configuration management systems in maintaining service reliability and the potential security implications when these systems are compromised or misconfigured.
+
+
+
+## Sources
+
+- [Salt Incident May 2020](https://blog.algolia.com/salt-incident-may-3rd-2020-retrospective-and-update/)
\ No newline at end of file
diff --git a/_posts/2020-05-12-slack-may-12-outage.md b/_posts/2020-05-12-slack-may-12-outage.md
new file mode 100644
index 0000000..109fa16
--- /dev/null
+++ b/_posts/2020-05-12-slack-may-12-outage.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Slack May 12, 2020 Outage"
+date: 2020-05-12
+categories: [outage, infrastructure, distributed-systems]
+tags: [slack, infrastructure, messaging, distributed-systems]
+company: Slack
+incident_date: 2020-05-12
+duration: "8+ hours"
+affected_services: ["Slack Messaging", "File Sharing", "Search"]
+---
+
+On May 12, 2020, Slack experienced a major outage caused by HAProxy configuration bugs that prevented proper service discovery with Consul. The incident began with database issues at 8:30 AM Pacific, followed by the main outage at 4:45 PM Pacific, which was resolved by a rolling restart of the HAProxy fleet.
+
+The outage occurred during peak remote work adoption due to COVID-19, highlighting the critical dependency of distributed teams on messaging platforms and the cascading effects of infrastructure configuration issues.
+
+
+
+## Sources
+
+- [A terrible, horrible, no good, very bad day at Slack - Slack Engineering](https://slack.engineering/a-terrible-horrible-no-good-very-bad-day-at-slack/)
\ No newline at end of file
diff --git a/_posts/2020-05-30-algolia-ssl-incident.md b/_posts/2020-05-30-algolia-ssl-incident.md
new file mode 100644
index 0000000..dc45fcd
--- /dev/null
+++ b/_posts/2020-05-30-algolia-ssl-incident.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "May 30 SSL incident"
+date: 2020-05-30
+categories: [outage, ssl, security]
+tags: [algolia, ssl, certificate, security]
+company: Algolia
+incident_date: 2020-05-30
+duration: "Several hours"
+affected_services: ["Algolia Search", "SSL/TLS Services"]
+---
+
+On May 30, 2020, Algolia experienced an SSL certificate-related incident that affected their search services. The incident involved certificate management issues that prevented secure connections to Algolia's services.
+
+SSL certificate management remains a common source of outages across the industry, highlighting the importance of automated certificate renewal and monitoring systems.
+
+
+
+## Sources
+
+- [SSL incident May 30](https://www.algolia.com/blog/engineering/may-30-ssl-incident/)
\ No newline at end of file
diff --git a/_posts/2020-07-15-twitter-security-incident.md b/_posts/2020-07-15-twitter-security-incident.md
new file mode 100644
index 0000000..4ca53d6
--- /dev/null
+++ b/_posts/2020-07-15-twitter-security-incident.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Twitter Security Incident"
+date: 2020-07-15
+categories: [security, social-engineering, incident]
+tags: [twitter, security, social-engineering, bitcoin, scam]
+company: Twitter
+incident_date: 2020-07-15
+duration: "Several hours"
+affected_services: ["Twitter", "High-Profile Accounts"]
+---
+
+On July 15, 2020, Twitter experienced a major security incident where attackers used social engineering to gain access to internal tools and compromise high-profile accounts including Barack Obama, Elon Musk, and Bill Gates. The compromised accounts were used to promote a Bitcoin scam.
+
+The incident highlighted how vulnerable employees with privileged access are to social engineering attacks and led to significant improvements in Twitter's internal security practices and access controls.
+
+
+
+## Sources
+
+- [An update on our security incident - Twitter Blog](https://blog.twitter.com/en_us/topics/company/2020/an-update-on-our-security-incident.html)
\ No newline at end of file
diff --git a/_posts/2020-07-19-debugging-distributed-system.md b/_posts/2020-07-19-debugging-distributed-system.md
new file mode 100644
index 0000000..9bfa48a
--- /dev/null
+++ b/_posts/2020-07-19-debugging-distributed-system.md
@@ -0,0 +1,16 @@
+---
+layout: post
+title: "Debugging a Misbehaving Distributed System"
+date: 2020-07-19
+company: "Community Story"
+duration: "N/A"
+tags: [debugging, distributed-systems, community, networking]
+---
+
+A community-shared story about debugging complex distributed system issues, where seemingly unrelated symptoms led to discovering deep architectural problems.
+
+This Twitter thread by Erin provides insights into the detective work required when distributed systems start misbehaving in unexpected ways.
+
+## Sources
+
+- [Original Twitter Thread by Erin](https://twitter.com/erincandescent/status/1281280157073002496)
\ No newline at end of file
diff --git a/_posts/2020-11-25-aws-kinesis-outage.md b/_posts/2020-11-25-aws-kinesis-outage.md
new file mode 100644
index 0000000..11477c6
--- /dev/null
+++ b/_posts/2020-11-25-aws-kinesis-outage.md
@@ -0,0 +1,24 @@
+---
+layout: post
+title: "Summary of the Amazon Kinesis Event in the Northern Virginia (US-EAST-1) Region"
+date: 2020-11-25
+categories: [outage, cloud, cascade-failure]
+tags: [aws, kinesis, capacity, cascading-failure, dependency]
+company: AWS
+incident_date: 2020-11-25
+duration: "20 hours"
+affected_services: ["Kinesis", "Lambda", "CloudWatch", "Elasticsearch", "AutoScaling"]
+---
+
+On November 25, 2020, AWS Kinesis experienced an outage in US-EAST-1 when new capacity additions caused servers to exceed maximum operating system thread limits. The issue began at 2:44 AM PST with the first customer impact at 5:15 AM PST, cascading through CloudWatch, Lambda, ECS, Cognito, and other services until resolution at 10:23 PM PST.
+
+The incident demonstrated how infrastructure changes can trigger cascading failures through tightly coupled services. What started as an operating system resource limit issue eventually impacted customer monitoring, alerting, and serverless applications worldwide, highlighting critical dependencies in cloud architecture.
+
+
+
+
+## Sources
+
+- [Summary of the Amazon Kinesis Event in the Northern Virginia (US-EAST-1) Region - November, 25th 2020](https://aws.amazon.com/message/11201/)
+- [Twitter Thread Summary by Gergely Orosz](https://twitter.com/GergelyOrosz/status/1337871810738655235)
+- [Detailed Technical Analysis](https://threadreaderapp.com/thread/1337869823204847616.html)
\ No newline at end of file
diff --git a/_posts/2020-12-14-google-oauth-outage.md b/_posts/2020-12-14-google-oauth-outage.md
new file mode 100644
index 0000000..6714d14
--- /dev/null
+++ b/_posts/2020-12-14-google-oauth-outage.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Google OAuth Service Disruption"
+date: 2020-12-14
+categories: [outage, authentication, oauth]
+tags: [google, oauth, authentication, identity]
+company: Google
+incident_date: 2020-12-14
+duration: "45 minutes"
+affected_services: ["Google OAuth", "Third-party Authentication"]
+---
+
+On December 14, 2020, Google's OAuth service experienced a disruption that prevented users from authenticating with third-party applications and services. The outage affected any application or service that relied on "Sign in with Google" functionality.
+
+The incident highlighted the critical dependency many applications have on centralized authentication providers and the cascading effects when these identity services become unavailable.
+
+
+
+## Sources
+
+- [OAuth unavailable Dec 2020](https://status.cloud.google.com/incident/zall/20013)
\ No newline at end of file
diff --git a/_posts/2021-01-04-slack-january-4-outage.md b/_posts/2021-01-04-slack-january-4-outage.md
new file mode 100644
index 0000000..157da36
--- /dev/null
+++ b/_posts/2021-01-04-slack-january-4-outage.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Slack January 4, 2021 Outage"
+date: 2021-01-04
+categories: [outage, messaging, infrastructure]
+tags: [slack, messaging, infrastructure, remote-work]
+company: Slack
+incident_date: 2021-01-04
+duration: "Several hours"
+affected_services: ["Slack Messaging", "Notifications", "File Sharing"]
+---
+
+On January 4, 2021, Slack experienced a significant outage affecting messaging, notifications, and file sharing services. The incident occurred during the height of remote work adoption, impacting millions of users starting their work week.
+
+The outage demonstrated the critical dependency modern workplaces have on messaging platforms and the challenges of maintaining service reliability when these platforms become essential business infrastructure.
+
+
+
+## Sources
+
+- [Slack's outage on January 4th 2021](https://slack.engineering/slacks-outage-on-january-4th-2021/)
\ No newline at end of file
diff --git a/_posts/2021-05-11-salesforce-multi-instance-disruption.md b/_posts/2021-05-11-salesforce-multi-instance-disruption.md
new file mode 100644
index 0000000..f4b39f9
--- /dev/null
+++ b/_posts/2021-05-11-salesforce-multi-instance-disruption.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Salesforce Multi-Instance Disruption"
+date: 2021-05-11
+categories: [outage, saas, infrastructure]
+tags: [salesforce, saas, multi-tenant, infrastructure]
+company: Salesforce
+incident_date: 2021-05-11
+duration: "Several hours"
+affected_services: ["Salesforce CRM", "Multiple Instances", "Lightning Platform"]
+---
+
+On May 11, 2021, Salesforce experienced a multi-instance service disruption affecting multiple regions and customer instances. The incident impacted core CRM functionality and the Lightning Platform, affecting thousands of organizations worldwide.
+
+The outage highlighted the challenges of maintaining service reliability across Salesforce's multi-tenant architecture and the critical dependency many businesses have on CRM platforms for their daily operations.
+
+
+
+## Sources
+
+- [Multi-Instance Disruption May 2021](https://help.salesforce.com/s/articleView?id=000358392&type=1)
\ No newline at end of file
diff --git a/_posts/2021-06-08-fastly-global-outage.md b/_posts/2021-06-08-fastly-global-outage.md
new file mode 100644
index 0000000..133dbce
--- /dev/null
+++ b/_posts/2021-06-08-fastly-global-outage.md
@@ -0,0 +1,23 @@
+---
+layout: post
+title: "Summary of June 8 outage"
+date: 2021-06-08
+categories: [outage, cdn, configuration]
+tags: [fastly, cdn, configuration, edge-computing, deployment]
+company: Fastly
+incident_date: 2021-06-08
+duration: "49 minutes"
+affected_services: ["Fastly CDN", "Edge Compute"]
+---
+
+On June 8, 2021, Fastly experienced a 49-minute global CDN outage affecting major websites including Amazon, Reddit, GitHub, and The New York Times. A routine customer configuration change triggered a dormant software bug that caused 85% of Fastly's network to return HTTP 503 errors instead of serving content.
+
+The bug was introduced in a May 12th software deployment but remained undetected until the specific configuration pattern was used. Recovery involved identifying and rolling back the problematic configuration globally, highlighting the risks of configuration complexity in distributed systems.
+
+
+
+
+## Sources
+
+- [Summary of June 8 outage - Fastly](https://www.fastly.com/blog/summary-of-june-8-outage)
+- [Service Incidents and Outages - Fastly Status](https://status.fastly.com/)
\ No newline at end of file
diff --git a/_posts/2021-10-04-facebook-global-outage.md b/_posts/2021-10-04-facebook-global-outage.md
new file mode 100644
index 0000000..b67d66e
--- /dev/null
+++ b/_posts/2021-10-04-facebook-global-outage.md
@@ -0,0 +1,27 @@
+---
+layout: post
+title: "Update about the October 4th outage"
+date: 2021-10-04
+categories: [outage, networking, bgp]
+tags: [facebook, instagram, whatsapp, bgp, dns, networking]
+company: Facebook
+incident_date: 2021-10-04
+duration: "6+ hours"
+affected_services: ["Facebook", "Instagram", "WhatsApp", "Oculus"]
+---
+
+On October 4, 2021, Facebook experienced a 6+ hour global outage affecting Facebook, Instagram, WhatsApp, and Oculus. During routine backbone capacity maintenance, a bug in Facebook's audit tool caused BGP routes to be withdrawn, disconnecting their data centers from the internet and causing cascading DNS failures.
+
+The outage created additional complications as Facebook's internal systems, including door badge systems, were also down, making it physically difficult for engineers to access data centers. The incident highlighted risks of centralized internet infrastructure and the cascading effects of network-level failures.
+
+
+
+
+## Sources
+
+- [Understanding How Facebook Disappeared from the Internet](https://blog.cloudflare.com/october-2021-facebook-outage/)
+- [What happened on the Internet during the Facebook outage](https://blog.cloudflare.com/during-the-facebook-outage/)
+- [Update about the October 4th outage - Facebook Engineering](https://engineering.fb.com/2021/10/04/networking-traffic/outage/)
+- [More details about the October 4 outage - Facebook Engineering](https://engineering.fb.com/2021/10/05/networking-traffic/outage-details/)
+- [What Happened to Facebook, Instagram, WhatsApp? Krebs on Security](https://krebsonsecurity.com/2021/10/what-happened-to-facebook-instagram-whatsapp/)
+- [Why was Facebook down for five hours? - YouTube](https://www.youtube.com/watch?v=-wMU8vmfaYo) - Ben Eater's technical explanation
\ No newline at end of file
diff --git a/_posts/2021-10-28-roblox-outage.md b/_posts/2021-10-28-roblox-outage.md
new file mode 100644
index 0000000..2fdf48c
--- /dev/null
+++ b/_posts/2021-10-28-roblox-outage.md
@@ -0,0 +1,23 @@
+---
+layout: post
+title: "Roblox Return to Service - 10/28 - 10/31, 2021"
+date: 2021-10-28
+categories: [outage, gaming, infrastructure]
+tags: [roblox, gaming, infrastructure, scaling, user-load]
+company: Roblox
+incident_date: 2021-10-28
+duration: "73 hours"
+affected_services: ["Roblox Platform", "Game Development", "Virtual Economy"]
+---
+
+On October 28, 2021, Roblox experienced a 73-hour complete platform outage affecting over 50 million daily users. The incident was caused by infrastructure complexity issues during routine system operations, with tightly coupled services, database constraints, and cascading failures preventing recovery.
+
+The outage impacted the entire gaming ecosystem - players couldn't access games, developers lost revenue, and virtual currency transactions were frozen. Recovery was complicated by the need to restore services in specific sequences while maintaining data consistency across the platform's complex architecture.
+
+
+
+
+## Sources
+
+- [Roblox Return to Service - 10/28 - 10/31, 2021](https://blog.roblox.com/2022/01/roblox-return-to-service-10-28-10-31-2021/)
+- [Roblox Status Page](https://status.roblox.com/)
\ No newline at end of file
diff --git a/_posts/2022-02-16-authzed-outage.md b/_posts/2022-02-16-authzed-outage.md
new file mode 100644
index 0000000..139bbb3
--- /dev/null
+++ b/_posts/2022-02-16-authzed-outage.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Post-Mortem: Feb 16 2022"
+date: 2022-02-16
+categories: [outage, infrastructure, authorization]
+tags: [authzed, authorization, infrastructure]
+company: Authzed
+incident_date: 2022-02-16
+duration: "Several hours"
+affected_services: ["Authzed Authorization Service"]
+---
+
+On February 16, 2022, Authzed experienced a service outage affecting their authorization platform. The incident impacted customers relying on Authzed for fine-grained authorization and permission management.
+
+The outage highlighted the critical nature of authorization services in modern applications and the cascading effects when these foundational services become unavailable.
+
+
+
+## Sources
+
+- [Post-Mortem Feb 16 2022](https://authzed.com/blog/post-mortem-feb-2022/)
\ No newline at end of file
diff --git a/_posts/2022-04-04-atlassian-multi-product-outage.md b/_posts/2022-04-04-atlassian-multi-product-outage.md
new file mode 100644
index 0000000..612e839
--- /dev/null
+++ b/_posts/2022-04-04-atlassian-multi-product-outage.md
@@ -0,0 +1,25 @@
+---
+layout: post
+title: "Multi-Product, Multi-Week Outage - April 4th, 2022"
+date: 2022-04-04
+categories: [outage, database, maintenance]
+tags: [atlassian, jira, confluence, database, maintenance-gone-wrong]
+company: Atlassian
+incident_date: 2022-04-04
+duration: "2+ weeks"
+affected_services: ["Jira", "Confluence", "Bitbucket", "Opsgenie"]
+---
+
+On April 4, 2022, Atlassian began routine maintenance to delete inactive user accounts, but a script logic error caused active customer data to be deleted instead. This affected 775+ customers for over two weeks, making it one of the longest outages in modern SaaS history.
+
+Recovery was complicated by the scale of affected customers and data dependencies between Atlassian products. The incident highlighted critical gaps in maintenance script testing, backup independence, and the need for circuit breakers in destructive operations.
+
+
+
+
+## Sources
+
+- [Multi-Product, Multi-Week Outage - April 4th, 2022](https://www.atlassian.com/engineering/april-2022-outage-update)
+- [Day 7 of the great Atlassian outage: IT giant still struggling to restore access](https://www.theregister.com/2022/04/11/atlassian_still_down/)
+- [Post-Incident Review by Atlassian](https://www.atlassian.com/engineering/post-incident-review-april-2022-outage)
+- [The Scoop: Inside the Longest Atlassian Outage of All Time](https://newsletter.pragmaticengineer.com/p/scoop-atlassian?ref=blog.pragmaticengineer.com)
\ No newline at end of file
diff --git a/_posts/2022-05-30-deno-outage.md b/_posts/2022-05-30-deno-outage.md
new file mode 100644
index 0000000..da6acb0
--- /dev/null
+++ b/_posts/2022-05-30-deno-outage.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Deno Outage"
+date: 2022-05-30
+categories: [outage, infrastructure, javascript]
+tags: [deno, infrastructure, javascript, runtime]
+company: Deno
+incident_date: 2022-05-30
+duration: "Several hours"
+affected_services: ["Deno Deploy", "Deno Registry"]
+---
+
+On May 30, 2022, Deno experienced an outage affecting their deployment platform and package registry. The incident impacted developers using Deno's cloud services and highlighted the dependencies modern development workflows have on external services.
+
+The outage demonstrated how infrastructure failures can ripple through the JavaScript ecosystem, affecting not just deployed applications but also development and deployment pipelines.
+
+
+
+## Sources
+
+- [May 30 incident update 2022](https://deno.com/blog/2022-05-30-outage-post-mortem)
\ No newline at end of file
diff --git a/_posts/2022-07-08-rogers-communications-outage.md b/_posts/2022-07-08-rogers-communications-outage.md
new file mode 100644
index 0000000..bfb5924
--- /dev/null
+++ b/_posts/2022-07-08-rogers-communications-outage.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Rogers Communications Canada Outage"
+date: 2022-07-08
+categories: [outage, telecommunications, networking]
+tags: [rogers, telecommunications, canada, networking]
+company: Rogers Communications
+incident_date: 2022-07-08
+duration: "15+ hours"
+affected_services: ["Mobile Network", "Internet", "Cable TV", "Emergency Services"]
+---
+
+On July 8, 2022, Rogers Communications experienced a nationwide outage across Canada affecting mobile, internet, and cable services for over 15 hours. The outage also impacted critical services including 911 emergency services, payment systems, and government services.
+
+The incident highlighted the risks of telecommunications infrastructure concentration and the cascading effects when a major carrier fails, affecting not just consumers but critical national infrastructure.
+
+
+
+## Sources
+
+- [Canada outage July 2022](https://blog.cloudflare.com/cloudflares-view-of-the-rogers-communications-outage-in-canada/)
\ No newline at end of file
diff --git a/_posts/2022-07-19-uk-heatwave-datacenter-cooling.md b/_posts/2022-07-19-uk-heatwave-datacenter-cooling.md
new file mode 100644
index 0000000..4a56233
--- /dev/null
+++ b/_posts/2022-07-19-uk-heatwave-datacenter-cooling.md
@@ -0,0 +1,24 @@
+---
+layout: post
+title: "UK Heatwave Datacenter Failures"
+date: 2022-07-19
+categories: [outage, infrastructure, climate]
+tags: [cooling, heatwave, climate, datacenter, infrastructure, oracle, google]
+company: Multiple
+incident_date: 2022-07-19
+duration: "19 hours"
+affected_services: ["Oracle Cloud", "Google Cloud", "Multiple Datacenters"]
+---
+
+On July 19, 2022, the UK experienced record-breaking temperatures, reaching 40.3°C (104.5°F) for the first time on record. This extreme heat caused two cooling units at Oracle Cloud's UK South region to fail when operating above their design limits, resulting in protective equipment shutdowns that lasted approximately 19 hours.
+
+The incident highlighted the growing intersection between climate change and digital infrastructure reliability, demonstrating how infrastructure designed for historical climate conditions can fail during extreme weather events that are becoming more frequent.
+
+
+
+
+## Sources
+
+- [Oracle Cloud UK South Cooling Failure - July 19, 2022](https://ocistatus.oraclecloud.com/#/incidents/ocid1.oraclecloudincident.oc1.phx.amaaaaaavwew44aa7zoskanlspjh4ll6wxhwxrbkbed4d4cnupxexzqzvlyq)
+- [Google Cloud London Region Status](https://status.cloud.google.com/incidents/fmEL9i2fArADKawkZAa2)
+- [UK Met Office Temperature Records](https://www.metoffice.gov.uk/)
\ No newline at end of file
diff --git a/_posts/2022-08-09-doordash-kubernetes-incident.md b/_posts/2022-08-09-doordash-kubernetes-incident.md
new file mode 100644
index 0000000..1a4f11f
--- /dev/null
+++ b/_posts/2022-08-09-doordash-kubernetes-incident.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "DoorDash Kubernetes Health Checks Incident"
+date: 2022-08-09
+categories: [outage, kubernetes, health-checks]
+tags: [doordash, kubernetes, health-checks, black-friday]
+company: DoorDash
+incident_date: 2022-08-09
+duration: "Several hours"
+affected_services: ["DoorDash Platform", "Order Processing"]
+---
+
+DoorDash experienced a significant incident related to Kubernetes health check configurations during a high-traffic period similar to Black Friday. The incident demonstrated how misconfigurations in health checks can cascade through containerized applications during peak load.
+
+The outage highlighted the critical importance of properly configured health checks in Kubernetes environments and the challenges of maintaining service reliability during traffic spikes in food delivery platforms.
+
+
+
+## Sources
+
+- [How to handle Kubernetes Health Checks - Black Friday](https://doordash.engineering/2022/08/09/how-to-handle-kubernetes-health-checks/)
\ No newline at end of file
diff --git a/_posts/2023-03-08-datadog-outage.md b/_posts/2023-03-08-datadog-outage.md
new file mode 100644
index 0000000..836c2f1
--- /dev/null
+++ b/_posts/2023-03-08-datadog-outage.md
@@ -0,0 +1,21 @@
+---
+layout: post
+title: "Datadog Multi-Region Outage"
+date: 2023-03-08
+categories: [outage, monitoring, infrastructure]
+tags: [datadog, monitoring, multi-region, infrastructure]
+company: Datadog
+incident_date: 2023-03-08
+duration: "Several hours"
+affected_services: ["Datadog Monitoring", "Dashboards", "Alerting"]
+---
+
+On March 8, 2023, Datadog experienced a multi-region infrastructure connectivity issue affecting their monitoring and observability platform. The outage impacted customers' ability to monitor their own systems, creating a blind spot during the incident.
+
+The incident highlighted the irony of monitoring service outages - when your monitoring system is down, it becomes difficult to monitor everything else, emphasizing the need for diverse monitoring strategies.
+
+
+
+## Sources
+
+- [Multi-Region Outage March 2023](https://www.datadoghq.com/blog/2023-03-08-multiregion-infrastructure-connectivity-issue/)
\ No newline at end of file
diff --git a/index.html b/index.html
index b640894..d387ca4 100644
--- a/index.html
+++ b/index.html
@@ -39,51 +39,14 @@
Building Software Systems? Learn about
View Resources
+
Contribute to Failure Modes
Have a story, postmortem, or resource to share?
Submit it to our collection.