1 line
No EOL
76 KiB
JSON
1 line
No EOL
76 KiB
JSON
{"slug": "dastergon--awesome-sre", "title": "Sre", "description": "A curated list of Site Reliability and Production Engineering resources.", "github_url": "https://github.com/dastergon/awesome-sre", "stars": "9.1K", "tag": "Miscellaneous", "entry_count": 494, "subcategory_count": 1, "subcategories": [{"name": "Contributing", "parent": "", "entries": [{"name": "Culture", "url": "#culture", "description": ""}, {"name": "Education", "url": "#education", "description": ""}, {"name": "Books", "url": "#books", "description": ""}, {"name": "Hiring", "url": "#hiring", "description": ""}, {"name": "Reliability", "url": "#reliability", "description": ""}, {"name": "Monitoring & Observability & Alerting", "url": "#monitoring--observability--alerting", "description": ""}, {"name": "On-Call", "url": "#on-call", "description": ""}, {"name": "Post-Mortem", "url": "#post-mortem", "description": ""}, {"name": "Capacity Planning", "url": "#capacity-planning", "description": ""}, {"name": "Service Level Agreement", "url": "#service-level-agreement", "description": ""}, {"name": "Performance", "url": "#performance", "description": ""}, {"name": "Programming", "url": "#programming", "description": ""}, {"name": "Misc Articles", "url": "#misc-articles", "description": ""}, {"name": "Real-time Messaging", "url": "#real-time-messaging", "description": ""}, {"name": "Blogs", "url": "#blogs", "description": ""}, {"name": "Newsletters", "url": "#newsletters", "description": ""}, {"name": "Conferences & Meetups", "url": "#conferences-meetups", "description": ""}, {"name": "Twitter", "url": "#twitter", "description": ""}, {"name": "SRE Tools", "url": "#sre-tools", "description": ""}, {"name": "SRE Podcasts", "url": "#podcasts", "description": ""}, {"name": "What is Site Reliability Engineering?", "url": "https://landing.google.com/sre/interview/ben-treynor.html", "description": ""}, {"name": "Keys To SRE by Ben Treynor", "url": 
"https://www.usenix.org/conference/srecon14/technical-sessions/presentation/keys-sre", "description": ""}, {"name": "Google SRE Resources", "url": "https://landing.google.com/sre/resources.html", "description": ""}, {"name": "Notes from Production Engineering by Pedro Canahuati", "url": "https://www.usenix.org/conference/srecon15/program/presentation/canahuati", "description": ""}, {"name": "PostOps: Recovery from Operations", "url": "https://www.usenix.org/conference/srecon15europe/program/presentation/underwood", "description": ""}, {"name": "Love DevOps? Wait 'till you meet SRE", "url": "https://www.atlassian.com/it-service/site-reliability-engineering-sre", "description": ""}, {"name": "How Google Does Planet-Scale Engineering for Planet-Scale Infra", "url": "https://www.youtube.com/watch?v=H4vMcD7zKM0", "description": ""}, {"name": "Site Reliability Engineering at Facebook", "url": "https://www.facebook.com/notes/facebook-engineering/site-reliability-engineering-at-facebook/291616313919/", "description": ""}, {"name": "A History of Site Reliability Engineering at Uber", "url": "https://www.youtube.com/watch?v=qJnS-EfIIIE&nohtml5=False", "description": ""}, {"name": "Case Study: Adopting SRE Principles at StackOverflow", "url": "https://www.usenix.org/conference/srecon15/program/presentation/limoncelli", "description": ""}, {"name": "Site Reliability Engineering at Dropbox", "url": "https://www.youtube.com/watch?v=ggizCjUCCqE", "description": ""}, {"name": "Site Reliability Engineers \u2014 Keeping Google up and running 24/7", "url": "https://www.youtube.com/watch?v=yXI7r0_J29M", "description": ""}, {"name": "Site Reliability Engineering at Salesforce", "url": "https://www.salesforce.com/video/193050/", "description": ""}, {"name": "SRE@Google: Thousands of DevOps Since 2004", "url": "https://www.youtube.com/watch?v=iIuTnhdTzK0", "description": ""}, {"name": "Transactional System Administration Is Killing Us and Must be Stopped", "url": 
"https://www.usenix.org/conference/lisa15/conference-program/presentation/limoncelli", "description": ""}, {"name": "A hierarchy of SRE needs", "url": "https://web.archive.org/web/20190401220948/https://plus.google.com/+lizthegrey/posts/MLAJFVyEb2f", "description": ""}, {"name": "PostOps: A Non-Surgical Tale of Software, Fragility, and Reliability", "url": "https://www.usenix.org/conference/lisa13/technical-sessions/plenary/underwood", "description": ""}, {"name": "SRE: An incomplete guide to cultural Narnia", "url": "https://web.archive.org/web/20180820235243/http://anthonycaiafa.com/2016/04/10/sre-cultural-narnia/", "description": "[\\[Video\\]](https://www.youtube.com/watch?v=__wypEhdcrQ\\&t=0s)"}, {"name": "Putting Together Great SRE Teams", "url": "https://www.usenix.org/conference/srecon16/program/presentation/krishnan", "description": ""}, {"name": "Work at Google: Meet our Production Engineers for Site Reliability Hangout on Air", "url": "https://www.youtube.com/watch?v=bwt6TZjefGM", "description": ""}, {"name": "Toil: A Word Every Engineer Should Know", "url": "https://sharpend.io/toil-a-word-every-engineer-should-know/", "description": ""}, {"name": "Engineering Reliability into Web Sites: Google SRE", "url": "https://research.google.com/pubs/pub32583.html", "description": ""}, {"name": "DEVOPS & SRE AMA - Building High Performance Organizations", "url": "https://vimeo.com/179914447", "description": ""}, {"name": "John Allspaw's AMA on Incident Analysis and Postmortems", "url": "https://community.atlassian.com/t5/Jira-Ops-questions/I-m-John-Allspaw-Ask-Me-Anything-about-incident-analysis-and/qaq-p/957084", "description": ""}, {"name": "How SysAdmins Devalue Themselves", "url": "https://queue.acm.org/detail.cfm?id=2891413", "description": ""}, {"name": "The Softer Side of DevOps", "url": "https://www.youtube.com/watch?v=ry51Llzil1I", "description": ""}, {"name": "SRE, noun. 
See also: confidence, trust.", "url": "https://medium.com/@kobolog/sre-noun-see-also-confidence-trust-e7e33e19efc1", "description": ""}, {"name": "Site Reliability Engineering with Stephen Weinberg", "url": "https://youtu.be/24xb7oZgu-I?t=29m24s", "description": ""}, {"name": "We are the Google Site Reliability team. We make Google\u2019s websites work. Ask us Anything!", "url": "https://www.reddit.com/r/IAmA/comments/177267/we_are_the_google_site_reliability_team_we_make", "description": ""}, {"name": "We are the Google Site Reliability Engineering team. Ask us Anything!", "url": "https://www.reddit.com/r/IAmA/comments/1w1y5m/we_are_the_google_site_reliability_engineering/", "description": ""}, {"name": "The Ops Identity Crisis", "url": "http://www.susanjfowler.com/blog/2016/10/13/the-ops-identity-crisis", "description": ""}, {"name": "The Irreproducibility Of Bugs In Large-Scale Production Systems", "url": "http://www.susanjfowler.com/blog/2016/11/2/the-irreproducibility-of-bugs-in-large-scale-production-systems", "description": ""}, {"name": "SE-Radio Episode 276: Bj\u00f6rn Rabenstein on Site Reliability Engineering", "url": "http://www.se-radio.net/2016/12/se-radio-episode-276-bjorn-rabenstein-on-site-reliability-engineering/", "description": ""}, {"name": "Microservices, DevOps and Production Complexity", "url": "https://blog.netsil.com/microservices-devops-and-operational-complexity-be98cb01b660", "description": ""}, {"name": "Introducing Google Customer Reliability Engineering", "url": "https://cloudplatform.googleblog.com/2016/10/introducing-a-new-era-of-customer-support-Google-Customer-Reliability-Engineering.html", "description": ""}, {"name": "Evolution or Rebellion? 
The rise of Site Reliability Engineers (SRE)", "url": "https://robhirschfeld.com/2016/12/29/evolution-or-rebellion-the-rise-of-site-reliability-engineers-sre/", "description": ""}, {"name": "The difference between Site Reliability Engineering, System Administration, and DevOps", "url": "https://standalone-sysadmin.com/the-difference-between-site-reliability-engineering-system-administration-and-devops-d05031495499", "description": ""}, {"name": "SRE in the Small and in the Large", "url": "https://www.usenix.org/conference/lisa16/conference-program/presentation/closing-plenary", "description": ""}, {"name": "SBSRE Meetup: Different SRE roles and challenges(Netflix)", "url": "https://www.youtube.com/watch?v=zLXf0cKDOv0", "description": ""}, {"name": "Panel: Who/What Is SRE?", "url": "https://www.usenix.org/conference/srecon16/program/presentation/definition-of-sre-panel", "description": ""}, {"name": "Hope Is Not a Strategy", "url": "https://medium.com/@jerub/hope-is-not-a-strategy-6a7d0a3b1c08", "description": ""}, {"name": "Tenets of SRE", "url": "https://medium.com/@jerub/tenets-of-sre-8af6238ae8a8", "description": ""}, {"name": "Site Reliability Engineering Demystified", "url": "https://medium.com/@venkatachalamrangasamy/site-reliability-engineering-demystified-ed676e0a7d56", "description": ""}, {"name": "Is Site Reliability Engineering the True \u2018Ops\u2019 in DevOps?", "url": "https://devops.com/site-reliability-engineering-sre-true-ops-devops/", "description": ""}, {"name": "SRE vs. DevOps vs. 
Cloud Native: The Server Cage Match", "url": "https://devops.com/sre-devops-cloud-native-server-cage-match/", "description": ""}, {"name": "SRE: What\u2019s The Big Idea?", "url": "https://youtu.be/8dfYLRAWn_c", "description": ""}, {"name": "Building the SRE Culture at LinkedIn", "url": "https://engineering.linkedin.com/blog/2017/05/building-the-sre-culture-at-linkedin", "description": ""}, {"name": "Podcast #111 \u2013 SRE: Occasionally Maintaining Infrastructure That You Hate", "url": "https://stackoverflow.blog/2017/06/12/podcast-111-sre-occasionally-maintaining-infrastructure-hate/", "description": ""}, {"name": "Splicing SRE DNA Sequences in the Biggest Software Company on the Planet", "url": "https://www.usenix.org/conference/srecon16europe/program/presentation/splicing-sre-dna-sequences-biggest-software-company", "description": ""}, {"name": "Why should your app get SRE support? - CRE life lessons", "url": "https://cloudplatform.googleblog.com/2017/06/why-should-your-app-get-SRE-support-CRE-life-lessons.html", "description": ""}, {"name": "How SREs find the landmines in a service - CRE life lessons", "url": "https://cloudplatform.googleblog.com/2017/06/how-SREs-find-the-landmines-in-a-service-CRE-life-lessons.html", "description": ""}, {"name": "Making the most of an SRE service takeover - CRE life lessons", "url": "https://cloudplatform.googleblog.com/2017/07/making-the-most-of-an-SRE-service-takeover-CRE-life-lessons.html", "description": ""}, {"name": "The Cloudcast #301: SRE and Infrastructure Operations (Podcast)", "url": "https://dzone.com/articles/the-cloudcast-301-sre-and-infrastructure-operation", "description": ""}, {"name": "The SRE model", "url": "https://medium.com/@rakyll/the-sre-model-6e19376ef986", "description": ""}, {"name": "Onboarding New Site Reliability Engineers", "url": "https://circleci.com/blog/onboarding-new-site-reliability-engineers/", "description": ""}, {"name": "Building Blocks for Site Reliability At Google", "url": 
"https://www.youtube.com/watch?v=nQv9ySa8MTU", "description": ""}, {"name": "Beyond Google SRE: What is Site Reliability Engineering like at Medium?", "url": "https://blog.netsil.com/beyond-google-sre-what-is-site-reliability-engineering-like-at-medium-71c65bd35f4e", "description": ""}, {"name": "Intelligent Site Reliability Engineering \u2013 A Machine Learning Perspective", "url": "http://blog.adnanmasood.com/2016/05/19/intelligent-site-reliability-engineering-a-machine-learning-perspective/", "description": ""}, {"name": "A crash course in LinkedIn's global site operations", "url": "https://engineering.linkedin.com/day-life/crash-course-linkedins-global-site-operations", "description": ""}, {"name": "Google\u2019s Site Reliability Engineering with Todd Underwood", "url": "https://softwareengineeringdaily.com/2016/06/14/googles-site-reliability-engineering-todd-underwood/", "description": ""}, {"name": "What is Site Reliability Engineering? (VMware)", "url": "https://blogs.vmware.com/services-education-insights/2018/02/site-reliability-engineering.html", "description": ""}, {"name": "A Gentle Introduction to SRE", "url": "http://geekologist.co/introduction-to-sre/", "description": ""}, {"name": "Understanding Site Reliability Engineering through Movies and Books", "url": "http://engineering.medallia.com/blog/posts/understanding-site-reliability-engineering-through-movies-and-books/", "description": ""}, {"name": "GOTO 2017 \u2022 Site Reliability Engineering at Google \u2022 Christof Leng", "url": "https://www.youtube.com/watch?v=Cxb7a8lTv8A", "description": ""}, {"name": "Tech Leadership in SRE", "url": "https://www.youtube.com/watch?v=6G2V1xPIM64", "description": ""}, {"name": "The Azure Podcast: Episode 227 - Azure SRE", "url": "http://azpodcast.azurewebsites.net/post/Episode-227-Azure-SRE1", "description": ""}, {"name": "The human scalability of \"DevOps\"", "url": "https://medium.com/@mattklein123/the-human-scalability-of-devops-e36c37d3db6a", "description": 
""}, {"name": "Podcast: Site Reliability Management with Mike Hiraga", "url": "https://softwareengineeringdaily.com/2018/04/09/site-reliability-management-with-mike-hiraga/", "description": ""}, {"name": "How a cat inspired system reliability at Knowlarity", "url": "https://medium.com/@Knowlarity_Engineering/how-a-cat-inspired-system-reliability-at-knowlarity-ad73c24f29a7", "description": ""}, {"name": "Getting Started with Site Reliability Engineering", "url": "https://github.com/devopsenterprise/2018-London/blob/master/Tuesday/Breakout%20Sessions/Throne%2C%20Stephen%2C%20Getting%20Started%20with%20Site%20Reliability%20Engineering.pdf", "description": "", "stars": "111"}, {"name": "\"Practical Applications of the Dickerson Pyramid\" by Nat Welch", "url": "https://www.youtube.com/watch?v=xWAfTAu0Mww", "description": ""}, {"name": "LinkedIn\u2019s Kurt Andersen Uncovers Blindspots in SRE Implementations", "url": "https://blameless.com/blog/sre-implementations-blindspots/", "description": ""}, {"name": "Interview with Betsy Beyer, Stephen Thorne of Google", "url": "https://driftboatdave.com/2018/10/09/interview-with-betsy-beyer-stephen-thorne-of-google/", "description": ""}, {"name": "Less Risk Through Greater Humanity - Dave Rensin", "url": "https://www.youtube.com/watch?v=0zqBlRW_6jA", "description": ""}, {"name": "Getting Started with SRE - Stephen Thorne, Google", "url": "https://www.youtube.com/watch?v=c-w_GYvi0eA", "description": ""}, {"name": "Building Successful SRE in Large Enterprises", "url": "https://drive.google.com/file/d/1FXwHm6mpmRA9NaIJEu4cB1s6ffbyGBfl/view", "description": ""}, {"name": "Solving Reliability Fears with Site Reliability Engineering", "url": "https://www.youtube.com/watch?v=ZcZtU_TiFEM", "description": ""}, {"name": "SRE vs. 
DevOps: competing standards or close friends?", "url": "https://cloud.google.com/blog/products/gcp/sre-vs-devops-competing-standards-or-close-friends", "description": ""}, {"name": "How to Avoid the 5 SRE Implementation Traps that Catch Even the Best Teams", "url": "https://thenewstack.io/how-to-avoid-the-5-sre-implementation-traps-that-catch-even-the-best-teams/", "description": ""}, {"name": "Reliability Engineering \u2013 The Essential Discipline for Complex Systems", "url": "https://vimeo.com/344515149", "description": ""}, {"name": "The Modern Site Reliability Workbench on Top of OCI", "url": "https://www.youtube.com/watch?v=bC5dIPzNH24", "description": ""}, {"name": "SRE in the Third Age", "url": "https://www.usenix.org/conference/srecon19emea/presentation/rabenstein", "description": ""}, {"name": "About SRE and how (not) to apply it", "url": "https://www.youtube.com/watch?v=vF6ajM3P_wM", "description": ""}, {"name": "Transitioning a typical engineering ops team into an SRE powerhouse", "url": "https://cloud.google.com/blog/products/management-tools/transitioning-a-typical-engineering-ops-team-into-an-sre-powerhouse", "description": ""}, {"name": "Making a Lion Bulletproof: SRE in Banking", "url": "https://www.infoq.com/presentations/ing-sre-teams-practices/", "description": ""}, {"name": "Identifying and tracking toil using SRE principles", "url": "https://cloud.google.com/blog/products/management-tools/identifying-and-tracking-toil-using-sre-principles", "description": ""}, {"name": "From Ops to SRE: Evolution of the OpenShift Dedicated Team", "url": "https://www.openshift.com/blog/from-ops-to-sre-evolution-of-the-openshift-dedicated-team", "description": ""}, {"name": "Meeting reliability challenges with SRE principles", "url": "https://cloud.google.com/blog/products/management-tools/meeting-reliability-challenges-with-sre-principles", "description": ""}, {"name": "A quick introduction to SRE principles", "url": 
"https://github.com/fhivemind/sre-playground", "description": "", "stars": "24"}, {"name": "The SRE I Aspire to Be", "url": "https://www.youtube.com/watch?v=KnC2eRUZMKY", "description": ""}, {"name": "Taming Operational Load with VMware CRE", "url": "https://tanzu.vmware.com/content/blog/taming-operational-load-vmware-cre", "description": ""}, {"name": "SRE Cultural Values", "url": "https://dubrie.medium.com/sre-cultural-values-a0073b475183", "description": ""}, {"name": "Are we there yet? Thoughts on assessing an SRE team\u2019s maturity", "url": "https://cloud.google.com/blog/products/devops-sre/evaluating-where-your-team-lies-on-the-sre-spectrum", "description": ""}, {"name": "What SREs have to do with project-based services?", "url": "https://www.linkedin.com/pulse/what-sres-have-do-project-based-services-rod-anami/", "description": ""}, {"name": "Making operational work more visible", "url": "https://github.com/readme/guides/ops-work-visible", "description": ""}, {"name": "SRE vs. DevOps: What\u2019s the Difference Between Them?", "url": "https://spacelift.io/blog/sre-vs-devops", "description": ""}, {"name": "Panel: Educating SRE", "url": "https://www.usenix.org/conference/srecon15/program/presentation/sebenik", "description": ""}, {"name": "From Zero to Hero: Recommended Practices for Training your Ever-Evolving SRE Teams", "url": "https://www.usenix.org/conference/srecon15/program/presentation/widdowson", "description": ""}, {"name": "New to an SRE team?", "url": "https://www.linkedin.com/pulse/new-sre-team-anthony-caiafa/", "description": ""}, {"name": "The Systems Engineering Side of Site Reliability Engineering", "url": "https://www.usenix.org/publications/login/june15/hixson", "description": ""}, {"name": "Graduating from Bootcamp and interested in becoming a Site Reliability Engineer?", "url": "https://medium.com/@tammybutow/graduating-from-bootcamp-and-interested-in-becoming-a-site-reliability-engineer-b69a38ce858b", "description": ""}, {"name": "So 
you want to be a Site Reliability Engineer?", "url": "https://www.loomsystems.com/single-post/2016/03/23/So-you-want-to-be-a-Site-Reliability-Engineer", "description": ""}, {"name": "Spiraling Ops Debt & the SRE Coding Imperative", "url": "https://www.loomsystems.com/blog/2017/02/06/spiraling-ops-debt-the-sre-coding-imperative", "description": ""}, {"name": "So you want to be an SRE?", "url": "https://hackernoon.com/so-you-want-to-be-an-sre-34e832357a8c", "description": ""}, {"name": "Career Profiles/Site Reliability Engineer", "url": "https://www.khanacademy.org/college-careers-more/career-content/career-profile-videos/site-reliability-engineer/v/ruth-grace-site-reliability-engineer-what-i-do-and-how-much-i-make", "description": ""}, {"name": "What is the role of a Site Reliability Engineer?", "url": "https://cloudacademy.com/blog/what-is-the-role-of-a-site-reliability-engineer/", "description": ""}, {"name": "Lynda.com: DevOps Foundations: Site Reliability Engineering", "url": "https://www.lynda.com/Software-Development-tutorials/DevOps-Foundations-Site-Reliability-Engineering/669542-2.html", "description": ""}, {"name": "Incident Management Training: Wheel of Misfortune", "url": "https://dastergon.gr/wheel-of-misfortune/", "description": ""}, {"name": "Site Un-Reliability Engineering [Video Series]", "url": "https://www.youtube.com/watch?v=rmY8_PHanuI", "description": ""}, {"name": "The Ultimate Guide to Structuring a 90-Day Onboarding Plan", "url": "https://medium.com/swlh/the-ultimate-guide-to-structuring-a-90-day-onboarding-plan-c91af947376", "description": ""}, {"name": "SRE fundamentals: SLIs, SLAs and SLOs", "url": "https://cloud.google.com/blog/products/gcp/sre-fundamentals-slis-slas-and-slos", "description": ""}, {"name": "How to Get Into SRE", "url": "https://blog.alicegoldfuss.com/how-to-get-into-sre/", "description": ""}, {"name": "Do you have an SRE team yet? 
How to start and assess your journey", "url": "https://cloud.google.com/blog/products/devops-sre/how-to-start-and-assess-your-sre-journey", "description": ""}, {"name": "How SRE teams are organized, and how to get started", "url": "https://cloud.google.com/blog/products/devops-sre/how-sre-teams-are-organized-and-how-to-get-started", "description": ""}, {"name": "Why SRE Documents Matter", "url": "https://queue.acm.org/detail.cfm?id=3283589", "description": ""}, {"name": "How to get started with site reliability engineering (SRE)", "url": "https://www.oreilly.com/ideas/how-to-get-started-with-site-reliability-engineering-sre", "description": ""}, {"name": "Duties of a Site Reliability Engineering Manager", "url": "https://victorops.com/blog/duties-of-a-site-reliability-engineering-manager", "description": ""}, {"name": "Designing distributed systems using NALSD flashcards", "url": "https://cloud.google.com/blog/products/management-tools/sre-principles-and-flashcards-to-design-nalsd", "description": ""}, {"name": "Training Site Reliability Engineers: What Your Organization Needs to Create a Learning Program", "url": "https://landing.google.com/sre/resources/practicesandprocesses/training-site-reliability-engineers", "description": ""}, {"name": "SRE Classroom: Distributed PubSub workshop", "url": "https://landing.google.com/sre/resources/practicesandprocesses/sre-classroom/", "description": ""}, {"name": "School of SRE: Curriculum for onboarding non-traditional hires and new grads", "url": "https://linkedin.github.io/school-of-sre/", "description": ""}, {"name": "Practical Linux Infrastructure", "url": "https://link.springer.com/book/10.1007/978-1-4842-0511-2", "description": ""}, {"name": "Site Reliability Engineering: How Google Runs Production Systems", "url": "https://landing.google.com/sre/book.html", "description": ""}, {"name": "The Site Reliability Workbook: Practical Ways to Implement SRE", "url": "https://landing.google.com/sre/book.html", "description": 
""}, {"name": "Observability Engineering: Achieving Production Excellence", "url": "https://info.honeycomb.io/observability-engineering-oreilly-book-2022", "description": ""}, {"name": "The Practice Of Cloud System Administration: Designing and Operating Large Distributed Systems", "url": "http://the-cloud-book.com/", "description": ""}, {"name": "Web Operations - Keeping the Data On Time", "url": "http://shop.oreilly.com/product/0636920000136.do", "description": ""}, {"name": "The Checklist Manifesto: How to Get Things Right", "url": "http://atulgawande.com/book/the-checklist-manifesto/", "description": ""}, {"name": "Microservices in Production - Standard Principles and Requirements", "url": "http://www.oreilly.com/programming/free/microservices-in-production.csp", "description": ""}, {"name": "Production-Ready Microservices - Building Standardized Systems Across an Engineering Organization", "url": "http://shop.oreilly.com/product/0636920053675.do", "description": ""}, {"name": "Systems Performance: Enterprise and the Cloud", "url": "https://www.amazon.com/Systems-Performance-Enterprise-Brendan-Gregg/dp/0133390098/", "description": ""}, {"name": "Monitoring Distributed Systems: Case Studies from Google's SRE Teams", "url": "http://www.oreilly.com/webops-perf/free/monitoring-distributed-systems.csp", "description": ""}, {"name": "The Human Side of Postmortems: Managing Stress and Cognitive Biases", "url": "http://www.oreilly.com/webops-perf/free/the-human-side-of-postmortems.csp", "description": ""}, {"name": "Chaos Engineering: Building Confidence in System Behavior through Experiment", "url": "http://www.oreilly.com/webops-perf/free/chaos-engineering.csp", "description": ""}, {"name": "Post-Incident Reviews: Learning from Failure for Improved Incident Responses", "url": "https://victorops.com/oreilly-post-incident-review/", "description": ""}, {"name": "Antifragile Systems and Teams", "url": 
"http://www.oreilly.com/webops-perf/free/antifragile-systems-and-teams.csp", "description": ""}, {"name": "How to Monitoring the SRE Golden Signals (E-Book)", "url": "https://www.slideshare.net/OpsStack/how-to-monitoring-the-sre-golden-signals-ebook/", "description": ""}, {"name": "Incident Management for Operations", "url": "http://shop.oreilly.com/product/0636920036159.do", "description": ""}, {"name": "Real-World SRE", "url": "https://www.packtpub.com/web-development/real-world-sre", "description": ""}, {"name": "Seeking SRE", "url": "http://shop.oreilly.com/product/0636920063964.do", "description": ""}, {"name": "What is SRE?", "url": "https://www.verizondigitalmedia.com/e-book/oreilly-what-is-sre/", "description": ""}, {"name": "Engineering Reliable Mobile Applications: Strategies for Developing Resilient Native Mobile Applications", "url": "https://landing.google.com/sre/resources/practicesandprocesses/engineering-reliable-mobile-applications/", "description": ""}, {"name": "Building Secure and Reliable Systems", "url": "https://landing.google.com/sre/book.html", "description": ""}, {"name": "Chaos Engineering: Crash test your applications", "url": "https://www.manning.com/books/chaos-engineering/", "description": ""}, {"name": "97 Things Every SRE Should Know", "url": "https://www.oreilly.com/library/view/97-things-every/9781492081487/", "description": ""}, {"name": "Four Steps to Creating Effective Game Day Tests", "url": "https://shopify.engineering/four-steps-creating-effective-game-day-tests", "description": ""}, {"name": "The Linux Programming Interface", "url": "https://nostarch.com/tlpi", "description": ""}, {"name": "SRE Hiring", "url": "https://www.usenix.org/conference/srecon15/program/presentation/fong", "description": ""}, {"name": "Hiring SREs at LinkedIn", "url": "https://engineering.linkedin.com/engineering-culture/hiring-sres-linkedin", "description": ""}, {"name": "Hiring Site Reliability Engineers", "url": 
"https://www.usenix.org/publications/login/june15/hiring-site-reliability-engineers", "description": ""}, {"name": "Hiring your first SRE", "url": "https://sreally.com/hiring-your-first-sre-bdda38ee175d#.2m3sqyuw9", "description": ""}, {"name": "Growing the Site Reliability Team at LinkedIn: Hiring is Hard", "url": "https://www.youtube.com/watch?v=ZemNg9GYvOA", "description": ""}, {"name": "Engineering Manager - Site Reliability Engineering Interview Preparation", "url": "https://danrl.com/blog/srm", "description": ""}, {"name": "The Realities of the Job of Delivering Reliability", "url": "https://www.usenix.org/conference/srecon16/program/presentation/kroll", "description": ""}, {"name": "Fail at Scale by Ben Maurer", "url": "http://queue.acm.org/detail.cfm?id=2839461", "description": ""}, {"name": "Embracing Failure: Fault-Injection and Service Reliability", "url": "https://www.youtube.com/watch?v=wrY7XoOnysg", "description": ""}, {"name": "10 Years of Crashing Google", "url": "https://www.usenix.org/conference/lisa15/conference-program/presentation/krishnan", "description": ""}, {"name": "How we break things at Twitter: failure testing", "url": "https://blog.twitter.com/2015/how-we-break-things-at-twitter-failure-testing", "description": ""}, {"name": "Reliable Cron across the Planet", "url": "http://queue.acm.org/detail.cfm?id=2745840", "description": ""}, {"name": "Push our limits - reliability testing at Twitter", "url": "https://blog.twitter.com/2014/push-our-limits-reliability-testing-at-twitter", "description": ""}, {"name": "The Verification of a Distributed System by Caitie McCaffrey", "url": "http://queue.acm.org/detail.cfm?ref=rss&id=2889274", "description": ""}, {"name": "Weathering the Unexpected", "url": "http://queue.acm.org/detail.cfm?id=2371516", "description": ""}, {"name": "SRE Hour: Tech Talks by Box & Yelp", "url": "https://www.youtube.com/watch?v=YFDwdRVTg4g", "description": ""}, {"name": "Simplicity: A Prerequisite for Reliability", 
"url": "https://sharpend.io/simplicity-a-prerequisite-for-reliability/", "description": ""}, {"name": "The Two Sides to Google Infrastructure for Everyone Else", "url": "https://speakerdeck.com/garethr/the-two-sides-to-google-infrastructure-for-everyone-else", "description": ""}, {"name": "How Embracing Continuous Release Reduced Change Complexity", "url": "https://www.usenix.org/conference/ures14west/summit-program/presentation/dickson", "description": ""}, {"name": "Making \"Push On Green\" a Reality", "url": "https://www.usenix.org/publications/login/october-2014-vol-39-no-5/making-push-green-reality", "description": ""}, {"name": "BeyondCorp: A New Approach to Enterprise Security", "url": "https://www.usenix.org/publications/login/dec14/ward", "description": ""}, {"name": "Brainstorming Failure by Jeff Smith", "url": "https://www.youtube.com/watch?v=dKe9S8u44Yk", "description": ""}, {"name": "The Ripple Effect Of Outages And Downtime Cannot Be Underestimated", "url": "http://cloudtweaks.com/2016/04/outages-and-downtime/", "description": ""}, {"name": "The infrastructure behind Twitter: efficiency and optimization", "url": "https://blog.twitter.com/2016/the-infrastructure-behind-twitter-efficiency-and-optimization", "description": ""}, {"name": "Dickerson's Hierarchy of Reliability", "url": "https://docs.google.com/drawings/d/1kshrK2RLkW-XV8enmWZxeRFRgADj6d4Ru_w5txz_k9I/edit", "description": ""}, {"name": "The Morning Paper on Operability", "url": "https://blog.acolyer.org/2016/09/21/the-morning-paper-on-operability/", "description": ""}, {"name": "Production is all that matters", "url": "http://naildrivin5.com/blog/2013/06/16/production-is-all-that-matters.html", "description": ""}, {"name": "Using load shedding to survive a success disaster - CRE life lessons", "url": "https://cloudplatform.googleblog.com/2016/12/using-load-shedding-to-survive-a-success-disaster-CRE-life-lessons.html", "description": ""}, {"name": "How to avoid a self-inflicted DDoS Attack - 
CRE life lessons", "url": "https://cloudplatform.googleblog.com/2016/11/how-to-avoid-a-self-inflicted-DDoS-Attack-CRE-life-lessons.html", "description": ""}, {"name": "Don't gamble when it comes to reliability", "url": "https://www.oreilly.com/ideas/dont-gamble-when-it-comes-to-reliability", "description": ""}, {"name": "Resilience Engineering: Learning to Embrace Failure", "url": "https://queue.acm.org/detail.cfm?id=2371297", "description": ""}, {"name": "The Infrastructure Behind Twitter: Scale", "url": "https://blog.twitter.com/2017/the-infrastructure-behind-twitter-scale", "description": ""}, {"name": "Scaling Reliability at Twitter: So You Want to Add a 9", "url": "https://www.youtube.com/watch?v=hYu13kBenjE", "description": ""}, {"name": "Principles Of Chaos Engineering", "url": "http://principlesofchaos.org/", "description": ""}, {"name": "Chaos Engineering", "url": "https://www.infoq.com/articles/chaos-engineering", "description": ""}, {"name": "Available...or not? That is the question - CRE life lessons", "url": "https://cloudplatform.googleblog.com/2017/01/available-or-not-that-is-the-question-CRE-life-lessons.html", "description": ""}, {"name": "How Google Backs Up The Internet Along With Exabytes Of Other Data", "url": "http://highscalability.com/blog/2014/2/3/how-google-backs-up-the-internet-along-with-exabytes-of-othe.html", "description": ""}, {"name": "Performance, Scalability, And High Availability: 3 Key Infrastructure Adaptability Requirements", "url": "http://highscalability.com/blog/2017/2/2/performance-scalability-and-high-availability-3-key-infrastr.html", "description": ""}, {"name": "Reliable releases and rollbacks - CRE life lessons", "url": "https://cloudplatform.googleblog.com/2017/03/reliable-releases-and-rollbacks-CRE-life-lessons.html", "description": ""}, {"name": "How release canaries can save your bacon - CRE life lessons", "url": 
"https://cloudplatform.googleblog.com/2017/03/how-release-canaries-can-save-your-bacon-CRE-life-lessons.html", "description": ""}, {"name": "Things I Learned Managing Site Reliability for Some of the World\u2019s Busiest Gambling Sites", "url": "https://zwischenzugs.wordpress.com/2017/04/04/things-i-learned-managing-site-reliability-for-some-of-the-worlds-busiest-gambling-sites/", "description": ""}, {"name": "Every Day Is Monday in Operations", "url": "https://www.linkedin.com/pulse/introduction-every-day-monday-operations-benjamin-purgason", "description": ""}, {"name": "Under the Hood: Ensuring Site Reliability", "url": "https://engineering.squarespace.com/blog/2017/under-the-hood-ensuring-site-reliability", "description": ""}, {"name": "Designing reliable systems with cloud infrastructure (Google Cloud Next '17)", "url": "https://www.youtube.com/watch?v=7Hy_6SMn8pY", "description": ""}, {"name": "A Google SRE explores GitHub reliability with BigQuery", "url": "https://cloud.google.com/blog/big-data/2016/10/a-google-sre-explores-github-reliability-with-bigquery", "description": ""}, {"name": "Know thy enemy: how to prioritize and communicate risks - CRE life lessons", "url": "https://cloudplatform.googleblog.com/2017/05/know-thy-enemy-how-to-prioritize-and-communicate-risks-CRE-life-lessons.html", "description": ""}, {"name": "Chaos Engineering resources", "url": "https://github.com/dastergon/awesome-chaos-engineering", "description": "", "stars": "5.2k"}, {"name": "CRE life lessons: What is a dark launch, and what does it do for me?", "url": "https://cloudplatform.googleblog.com/2017/08/CRE-life-lessons-what-is-a-dark-launch-and-what-does-it-do-for-me.html", "description": ""}, {"name": "Why you should pick strong consistency, whenever possible", "url": "https://cloudplatform.googleblog.com/2018/01/why-you-should-pick-strong-consistency-whenever-possible.html", "description": ""}, {"name": "The Network is Reliable", "url": 
"https://queue.acm.org/detail.cfm?id=2655736", "description": ""}, {"name": "Are You Load Balancing Wrong?", "url": "https://queue.acm.org/detail.cfm?id=3028689", "description": ""}, {"name": "How production engineers support global events on Facebook", "url": "https://code.facebook.com/posts/166966743929963/how-production-engineers-support-global-events-on-facebook/", "description": ""}, {"name": "Google: A Collection Of Best Practices For Production Services", "url": "http://highscalability.com/blog/2018/4/16/google-a-collection-of-best-practices-for-production-service.html", "description": ""}, {"name": "Canary Analysis Service", "url": "https://queue.acm.org/detail.cfm?id=3194655", "description": ""}, {"name": "Tips for High Availability", "url": "https://medium.com/@NetflixTechBlog/tips-for-high-availability-be0472f2599c", "description": ""}, {"name": "Progressive Service Architecture At Auth0", "url": "https://auth0.com/blog/progressive-service-architecture-at-auth0/", "description": ""}, {"name": "Google Cloud Production Guideline", "url": "https://medium.com/google-cloud/production-guideline-9d5d10c8f1e", "description": ""}, {"name": "production readiness", "url": "https://jbd.dev/prod-readiness/", "description": ""}, {"name": "Trust By Design: The Fusion of Operational Maturity and Risk Modeling", "url": "https://www.youtube.com/watch?v=Vvd3uvNvMns", "description": ""}, {"name": "Top Seven Myths of Robust Systems", "url": "https://www.verica.io/top-seven-myths-of-robust-systems/", "description": ""}, {"name": "Taming chaos: Preparing for your next incident", "url": "https://www.oreilly.com/ideas/taming-chaos-preparing-for-your-next-incident", "description": ""}, {"name": "PID Loops and the Art of Keeping Systems Stable", "url": "https://www.youtube.com/watch?v=3AxSwCC7I4s", "description": ""}, {"name": "Are you ready for production?", "url": "https://www.youtube.com/watch?v=YptJ2rrGAYY", "description": 
"[Slides](https://speakerdeck.com/rakyll/are-you-ready-for-production)"}, {"name": "Production Checklist for Web Apps on Kubernetes", "url": "https://srcco.de/posts/web-service-on-kubernetes-production-checklist-2019.html", "description": ""}, {"name": "Finding a problem at the bottom of the Google stack", "url": "https://cloud.google.com/blog/products/management-tools/sre-keeps-digging-to-prevent-problems", "description": ""}, {"name": "Rethinking Task Size in SRE", "url": "https://www.oreilly.com/content/rethinking-task-size-in-sre/", "description": ""}, {"name": "How maintenance windows affect your error budget", "url": "https://cloud.google.com/blog/products/management-tools/sre-error-budgets-and-maintenance-windows", "description": ""}, {"name": "The Production Readiness Spectrum", "url": "https://dastergon.gr/posts/2020/09/the-production-readiness-spectrum/", "description": ""}, {"name": "Generic mitigations", "url": "https://www.oreilly.com/content/generic-mitigations/", "description": ""}, {"name": "How we\u2019re building a production readiness review process at Grafana Labs", "url": "https://grafana.com/blog/2021/10/13/how-were-building-a-production-readiness-review-process-at-grafana-labs/", "description": ""}, {"name": "Resiliency Planning for High-Traffic Events", "url": "https://shopify.engineering/resiliency-planning-for-high-traffic-events", "description": ""}, {"name": "Using Fault Injection Testing to Improve DoorDash Reliability", "url": "https://doordash.engineering/2022/04/25/using-fault-injection-testing-to-improve-doordash-reliability/", "description": ""}, {"name": "A Working Theory-of-Monitoring", "url": "https://www.usenix.org/conference/lisa13/working-theory-monitoring", "description": ""}, {"name": "The Evolution of Monitoring Systems at Google - Tony Rippy", "url": "https://vimeo.com/131484321", "description": ""}, {"name": "Monitoring without Infrastructure @ Airbnb", "url": 
"https://www.usenix.org/conference/srecon15/program/presentation/serebryany", "description": ""}, {"name": "Monitoring distributed systems", "url": "https://www.oreilly.com/ideas/monitoring-distributed-systems", "description": ""}, {"name": "Observability at Uber Engineering: Past, Present, Future", "url": "https://www.youtube.com/watch?v=2JAnmzVwgP8", "description": ""}, {"name": "The 4 Golden Signals of API Health and Performance in Cloud-Native Applications", "url": "https://blog.netsil.com/the-4-golden-signals-of-api-health-and-performance-in-cloud-native-applications-a6e87526e74", "description": ""}, {"name": "My Philosophy on Alerting by Rob Ewaschuk", "url": "https://docs.google.com/document/d/199PqyG3UsyXlwieHaqbGiWVa8eMWi8zzAn0YfcApr8Q/preview#", "description": ""}, {"name": "Time To Detect - Netflix", "url": "https://www.youtube.com/watch?v=wsgpV67MLFo", "description": ""}, {"name": "Why Percentiles Don\u2019t Work the Way you Think", "url": "https://www.vividcortex.com/blog/why-percentiles-dont-work-the-way-you-think", "description": ""}, {"name": "Building Twitter\u2019s Next-Gen Alerting System", "url": "https://www.youtube.com/watch?v=jQggG0qIjTM", "description": ""}, {"name": "Instrumentation: Worst case performance matters", "url": "https://honeycomb.io/blog/2017/01/instrumentation-worst-case-performance-matters/", "description": ""}, {"name": "Instrumentation: What does 'uptime' mean?", "url": "https://honeycomb.io/blog/2017/01/instrumentation-what-does-uptime-mean/", "description": ""}, {"name": "Incidents + Outages at CircleCI: Our Playbook and What We\u2019ve Learned", "url": "https://circleci.com/blog/incidents-outages-at-circleci-our-playbook-and-what-we-ve-learned/", "description": ""}, {"name": "An introduction to monitoring and alerting with timeseries at scale, with Prometheus", "url": "https://www.youtube.com/watch?v=gNmWzkGViAY", "description": ""}, {"name": "Detecting outliers and anomalies in realtime at Datadog", "url": 
"https://www.youtube.com/watch?v=mG4ZpEhRKHA", "description": ""}, {"name": "How to Monitor the SRE Golden Signals", "url": "https://medium.com/devopslinks/how-to-monitor-the-sre-golden-signals-1391cadc7524", "description": ""}, {"name": "Monitoring in a DevOps World", "url": "https://queue.acm.org/detail.cfm?id=3178371", "description": ""}, {"name": "Monitoring Your Monitoring\u2019s Monitoring", "url": "https://medium.com/@jerub/monitoring-your-monitorings-monitoring-51d479100f4c", "description": ""}, {"name": "Observability: the new wave or buzzword?", "url": "https://medium.com/@dlite/observability-the-new-wave-or-buzzword-fc23a68abf72", "description": ""}, {"name": "Monitoring Isn't Observability", "url": "https://www.vividcortex.com/blog/monitoring-isnt-observability", "description": ""}, {"name": "Monitoring in the time of Cloud Native", "url": "https://medium.com/@copyconstruct/monitoring-in-the-time-of-cloud-native-c87c7a5bfa3e", "description": ""}, {"name": "Principles of Monitoring Microservices", "url": "https://www.youtube.com/watch?v=2LNHv0JyBUk", "description": ""}, {"name": "The Many Ways Your Monitoring Is Lying to You", "url": "https://www.usenix.org/node/197446", "description": ""}, {"name": "GitOps Part 3 - Observability", "url": "https://www.weave.works/blog/gitops-part-3-observability", "description": ""}, {"name": "Want to Debug Latency?", "url": "https://medium.com/observability/want-to-debug-latency-7aa48ecbe8f7", "description": ""}, {"name": "Debugging Latency in Go 1.11", "url": "https://medium.com/observability/debugging-latency-in-go-1-11-9f97a7910d68", "description": ""}, {"name": "Alerting on SLOs like Pros", "url": "https://developers.soundcloud.com/blog/alerting-on-slos", "description": ""}, {"name": "Applied Alerting Philosophy", "url": "https://www.youtube.com/watch?v=JhxfZ0VIPP0", "description": ""}, {"name": "Observations on Observability", "url": "https://blog.colinbreck.com/observations-on-observability/", "description": ""}, 
{"name": "Deploys: It's Not Actually About Fridays", "url": "https://charity.wtf/2019/10/28/deploys-its-not-actually-about-fridays/", "description": ""}, {"name": "Site Reliability Engineering Best Practices for Data Pipelines", "url": "https://medium.com/better-programming/site-reliability-engineering-best-practices-for-data-pipelines-44a78e91f6f0", "description": ""}, {"name": "Elastic Observability in SRE and Incident Response", "url": "https://www.elastic.co/blog/elastic-observability-sre-incident-response", "description": ""}, {"name": "Error Budget Policy - Part 1 - Adoption at Expedia Group", "url": "https://medium.com/expedia-group-tech/error-budget-policy-adoption-at-expedia-group-7d80d41c4a8b", "description": ""}, {"name": "Error Budget Policy - Part 2 - Practices at Expedia Group", "url": "https://medium.com/expedia-group-tech/error-budget-policies-in-practice-4c98f56a28c1", "description": ""}, {"name": "Being an On-Call Engineer: A Google SRE Perspective", "url": "http://research.google.com/pubs/pub44813.html", "description": ""}, {"name": "Inside Atlassian: how our site reliability engineers do incident management", "url": "https://www.atlassian.com/blog/it-teams/inside-atlassian-site-reliability-engineers-incident-management", "description": ""}, {"name": "Inside Atlassian: how IT & SRE use ChatOps to run incident management", "url": "https://www.atlassian.com/blog/2016/02/inside-atlassian-sre-use-chatops-run-incident-management", "description": ""}, {"name": "Incident Response at Heroku", "url": "https://blog.heroku.com/archives/2014/5/9/incident-response-at-heroku", "description": ""}, {"name": "Who's On Call?", "url": "http://www.susanjfowler.com/blog/2016/9/6/whos-on-call", "description": ""}, {"name": "SysAdvent - Day 6 - No More On-Call Martyrs", "url": "https://sysadvent.blogspot.com/2016/12/day-6-no-more-on-call-martyrs.html", "description": ""}, {"name": "On Being On Call", "url": "http://naildrivin5.com/blog/2016/12/07/on-call.html", 
"description": ""}, {"name": "The On-Call Handbook", "url": "https://github.com/alicegoldfuss/oncall-handbook", "description": "", "stars": "381"}, {"name": "Incident management at Google \u2014 adventures in SRE-land", "url": "https://cloudplatform.googleblog.com/2017/02/Incident-management-at-Google-adventures-in-SRE-land.html", "description": ""}, {"name": "Run Book / Operations Manual template", "url": "https://github.com/SkeltonThatcher/run-book-template", "description": "", "stars": "647"}, {"name": "Automating Your Oncall: Open Sourcing Fossor and Ascii Etch", "url": "https://engineering.linkedin.com/blog/2017/12/open-sourcing-fossor-and-ascii-etch", "description": ""}, {"name": "Project STAR\\*: Streamlining Our On-Call Process", "url": "https://engineering.linkedin.com/blog/2018/01/project-star-streamlining-our-on-call-process", "description": ""}, {"name": "SRE@Xero: Managing Incidents Part I", "url": "https://devblog.xero.com/sre-xero-managing-incidents-part-i-7d02d650a71c", "description": ""}, {"name": "SRE@Xero: Managing Incidents Part II", "url": "https://devblog.xero.com/sre-xero-managing-incidents-part-ii-224a6e06f426", "description": ""}, {"name": "How To Establish a High Severity Incident Management Program", "url": "https://www.gremlin.com/how-to-establish-a-high-severity-incident-management-program/", "description": ""}, {"name": "How Your Systems Keep Running Day After Day - John Allspaw", "url": "https://www.youtube.com/watch?v=xA5U85LSk0M", "description": ""}, {"name": "On-call doesn\u2019t have to suck", "url": "https://medium.com/@copyconstruct/on-call-b0bd8c5ea4e0", "description": ""}, {"name": "Why, as a Netflix infrastructure manager, am I on call?", "url": "https://medium.com/@awspyker/why-as-a-netflix-infrastructure-manager-am-i-on-call-bdc551ac01fe", "description": ""}, {"name": "Oncall and Sustainable Software Development", "url": "https://honeycomb.io/blog/2018/02/oncall-and-sustainable-software-development/", "description": ""}, 
{"name": "On Call Rotations: How Best to Wake Devs Up in the Middle of the Night", "url": "https://thenewstack.io/call-rotations-best-wake-devs-middle-night/", "description": ""}, {"name": "Understanding The Role Of The Incident Manager On-Call (IMOC)", "url": "https://www.gremlin.com/community/tutorials/understanding-the-role-of-the-incident-manager-on-call-imoc/", "description": ""}, {"name": "3 Ways to Minimize the Impact of High Severity Incidents", "url": "https://devops.com/three-ways-to-minimize-the-impact-of-high-severity-incidents/", "description": ""}, {"name": "Advice to Management Teams While Enrolling Changes to On-Call Systems", "url": "https://thenewstack.io/advice-management-teams-enrolling-changes-on-call-systems/", "description": ""}, {"name": "Moving Past Shallow Incident Data", "url": "http://www.adaptivecapacitylabs.com/blog/2018/03/23/moving-past-shallow-incident-data/", "description": ""}, {"name": "Sustainable On-Call", "url": "https://codywilbourn.com/2018/03/22/sustainable-on-call/", "description": ""}, {"name": "dotScale 2017 - Aish Raj Dahal - Chaos management during a major incident", "url": "https://youtu.be/8pPrtf1J1Z8", "description": ""}, {"name": "Incident Management at Netflix Velocity", "url": "https://www.infoq.com/presentations/netflix-incident-management", "description": ""}, {"name": "Incidents, fixes, and the day after", "url": "https://medium.com/booking-com-infrastructure/incidents-fixes-and-the-day-after-c5d9aeae28c3", "description": ""}, {"name": "10 Steps to Develop an Incident Response Plan You\u2019ll ACTUALLY Use", "url": "https://engineering.salesforce.com/10-steps-to-develop-an-incident-response-plan-youll-actually-use-6cc49d9bf94c", "description": ""}, {"name": "Checklists: a stupidly simple but valuable operational gift", "url": "https://tech.buzzfeed.com/checklists-an-operational-gift-aaf42cf0be12", "description": ""}, {"name": "How to write a status page update", "url": 
"https://blog.hostedgraphite.com/2018/09/13/how-to-write-a-status-page-update/", "description": ""}, {"name": "Atlassian Incident Handbook", "url": "https://www.atlassian.com/software/jira/ops/handbook", "description": ""}, {"name": "PagerDuty Incident Response Handbook", "url": "https://response.pagerduty.com/", "description": ""}, {"name": "Avoiding Burnout for SREs", "url": "https://blog.zenduty.com/blog/2019/05/02/Avoiding-SRE-Burnout", "description": ""}, {"name": "Better On-Call the SRE way", "url": "https://vimeo.com/344516642", "description": ""}, {"name": "Managing Incidents at Monzo", "url": "https://www.youtube.com/watch?v=ZqwVlsIonIw", "description": ""}, {"name": "Making On-Call Not Suck", "url": "https://dev.to/molly_struve/making-on-call-not-suck-490", "description": ""}, {"name": "How we (Monzo) respond to incidents", "url": "https://monzo.com/blog/2019/07/08/how-we-respond-to-incidents", "description": ""}, {"name": "How we\u2019ve evolved on-call at Monzo", "url": "https://monzo.com/blog/how-weve-evolved-on-call-at-monzo", "description": ""}, {"name": "Code Yellow: When Operations Isn\u2019t Perfect", "url": "https://devops.com/code-yellow-when-operations-isnt-perfect/", "description": ""}, {"name": "MTTR is dead, long live CIRT", "url": "https://opensource.com/article/19/7/measure-operational-performance", "description": ""}, {"name": "Extended Dreyfus Model for Incident Lifecycles", "url": "https://github.com/preed/incident-lifecycle-model", "description": "", "stars": "24"}, {"name": "Inhumanity of Root Cause Analysis", "url": "https://www.verica.io/inhumanity-of-root-cause-analysis/", "description": ""}, {"name": "Incident insights from NASA, NTSB, and the CDC", "url": "https://www.youtube.com/watch?v=ODYO2MPymJ4", "description": ""}, {"name": "How to avoid On-Call Burnout the SRE Way", "url": "https://www.squadcast.com/blog/how-to-avoid-on-call-burnout", "description": ""}, {"name": "My week shadowing a GitLab Site Reliability Engineer", 
"url": "https://about.gitlab.com/blog/2019/12/16/sre-shadow/", "description": ""}, {"name": "How our production team runs the weekly on-call handover", "url": "https://about.gitlab.com/blog/2018/03/14/the-on-call-handover-at-gitlab/", "description": ""}, {"name": "Writing Runbook Documentation When You\u2019re An SRE", "url": "https://www.transposit.com/blog/2020.01.30-writing-runbook-documentation-when-youre-an-sre/", "description": ""}, {"name": "Incident response, programs and you(r startup)", "url": "https://lethain.com/incident-response-programs-and-your-startup/", "description": ""}, {"name": "An Incident Command Training Handbook", "url": "https://blog.danslimmon.com/2019/06/24/an-incident-command-training-handbook/", "description": ""}, {"name": "Shrinking the time to mitigate production incidents", "url": "https://cloud.google.com/blog/products/management-tools/shrinking-the-time-to-mitigate-production-incidents", "description": ""}, {"name": "Incident writeup as sociological storytelling", "url": "https://surfingcomplexity.blog/2021/06/11/incident-writeup-as-sociological-storytelling/", "description": ""}, {"name": "Elephant in the Blameless War Room: Accountability", "url": "https://www.blameless.com/incident-response/elephant-in-the-blameless-war-room-accountability", "description": ""}, {"name": "Naming names in incident writeups", "url": "https://surfingcomplexity.blog/2021/05/22/naming-names-in-incident-writeups/", "description": ""}, {"name": "Building On-Call Culture at GitHub", "url": "https://github.blog/2021-01-06-building-on-call-culture-at-github/", "description": ""}, {"name": "A collection of post-mortems", "url": "https://github.com/danluu/post-mortems", "description": "", "stars": "9.8k"}, {"name": "Collection of Kubernetes Failure Stories", "url": "https://github.com/hjacobs/kubernetes-failure-stories", "description": "", "stars": "6.2k"}, {"name": "Blameless PostMortems and a Just Culture", "url": 
"https://codeascraft.com/2012/05/22/blameless-postmortems/", "description": ""}, {"name": "A Tale of Postmortems", "url": "https://blog.box.com/blog/a-tale-of-postmortems/", "description": ""}, {"name": "Building a Blameless Post-Mortem Culture with Jason Hand", "url": "http://runasradio.com/Shows/Show/486", "description": ""}, {"name": "The infinite hows", "url": "https://www.oreilly.com/ideas/the-infinite-hows", "description": ""}, {"name": "Failure is Always An Option: How a Blameless Culture Leads to Better Results", "url": "https://victorops.com/blog/blameless-culture/", "description": ""}, {"name": "SysAdvent - Day 1 - Why You Need a Postmortem Process", "url": "https://sysadvent.blogspot.com/2016/12/day-1-why-you-need-postmortem-process.html", "description": ""}, {"name": "Etsy\u2019s Debriefing Facilitation Guide for Blameless Postmortems", "url": "https://codeascraft.com/2016/11/17/debriefing-facilitation-guide/", "description": ""}, {"name": "Writing Your First Postmortem", "url": "https://sharpend.io/writing-your-first-postmortem/", "description": ""}, {"name": "How to Write Great Outage Post-Mortems", "url": "https://artsy.github.io/blog/2014/11/19/how-to-write-great-outage-post-mortems/", "description": ""}, {"name": "A collection of postmortem templates", "url": "https://github.com/dastergon/postmortem-templates", "description": "", "stars": "971"}, {"name": "Embracing Feedback", "url": "https://blog.heptio.com/embracing-feedback-2fd703da714f", "description": ""}, {"name": "Postmortem Action Items: Plan the Work and Work the Plan", "url": "https://www.usenix.org/conference/srecon17americas/program/presentation/lueder", "description": ""}, {"name": "Social Issues In Postmortems", "url": "https://medium.com/@allspaw/social-issues-in-postmortems-d48dde624d18", "description": ""}, {"name": "Google Has an Official Process in Place for Learning From Failure--and It's Absolutely Brilliant", "url": 
"https://www.inc.com/justin-bariso/meet-postmortem-googles-brilliant-process-tool-for-learning-from-failure.html", "description": ""}, {"name": "Postmortem culture: how you can learn from failure", "url": "https://rework.withgoogle.com/blog/postmortem-culture-how-you-can-learn-from-failure/", "description": ""}, {"name": "re:Work - Postmortem discussion template", "url": "https://docs.google.com/document/d/1ob0dfG_gefr_gQ8kbKr0kS4XpaKbc0oVAk4Te9tbDqM/edit", "description": ""}, {"name": "Post-mortems to the rescue", "url": "https://increment.com/documentation/post-mortems-to-the-rescue/", "description": ""}, {"name": "Postmortem Action Items: Plan the Work and Work the Plan", "url": "https://ai.google/research/pubs/pub45906", "description": ""}, {"name": "Why Every Company Can Benefit from a Blameless Culture", "url": "https://www.blameless.com/why-companies-can-benefit-from-blameless-culture/", "description": ""}, {"name": "\"It's dead, Jim\": How we write an incident postmortem", "url": "https://www.hostedgraphite.com/blog/its-dead-jim-how-we-write-an-incident-postmortem", "description": ""}, {"name": "Our incident postmortem template", "url": "https://www.hostedgraphite.com/blog/incident-postmortem-template", "description": ""}, {"name": "Learn out of mistakes. 
Postmortems to the rescue.", "url": "https://fernandocejas.com/2020/03/21/learn-out-of-mistakes-postmortems/", "description": ""}, {"name": "Improving Postmortem Practices with Veteran Google SRE, Steve McGhee", "url": "https://www.blameless.com/improve-postmortem-with-sre-steve-mcghee/", "description": ""}, {"name": "Inhumanity of Root Cause Analysis", "url": "https://www.verica.io/blog/inhumanity-of-root-cause-analysis/", "description": ""}, {"name": "Capacity Planning", "url": "https://www.usenix.org/system/files/login/articles/login_feb15_07_hixson.pdf", "description": ""}, {"name": "SouthBay SRE: Cloud Capacity Planning", "url": "https://www.youtube.com/watch?v=MDQ0uEUmLOo", "description": ""}, {"name": "Intent-based Capacity Planning and Autoscaling with Kubernetes", "url": "https://www.squadcast.com/blog/intent-based-capacity-planning-and-autoscaling-with-kubernetes", "description": ""}, {"name": "How do you do Capacity Planning", "url": "https://jvns.ca/blog/2016/03/20/how-do-you-do-capacity-planning/", "description": ""}, {"name": "How Back Market SREs prepared for Black Friday", "url": "https://medium.com/back-market-engineering/how-back-market-sres-prepared-for-black-friday-5f017f343408", "description": ""}, {"name": "If It's in the Cloud, Get It on Paper: Cloud Computing Contract Issues", "url": "http://er.educause.edu/articles/2010/6/if-its-in-the-cloud-get-it-on-paper-cloud-computing-contract-issues", "description": ""}, {"name": "Service Level Agreements in the Cloud: Who cares?", "url": "http://www.wired.com/insights/2011/12/service-level-agreements-in-the-cloud-who-cares/", "description": ""}, {"name": "SysAdvent- Day 20 - How to set and monitor SLAs", "url": "https://sysadvent.blogspot.com/2016/12/day-20-how-to-set-and-monitor-slas.html", "description": ""}, {"name": "SLOs, SLIs, SLAs, oh my - CRE life lessons", "url": "https://cloudplatform.googleblog.com/2017/01/availability-part-deux--CRE-life-lessons.html", "description": ""}, {"name": 
"Service Levels and Error Budgets", "url": "https://www.usenix.org/conference/srecon16/program/presentation/jones", "description": ""}, {"name": "(Un)Reliability Budgets - Finding Balance between Innovation and Reliability", "url": "https://www.usenix.org/system/files/login/articles/login_aug15_06_roth.pdf", "description": ""}, {"name": "The Calculus of Service Availability", "url": "https://queue.acm.org/detail.cfm?id=3096459&__s=dnkxuaws9pogqdnxmx8i", "description": ""}, {"name": "Availability Calculator: Calculate how much downtime should be permitted in your SLA", "url": "https://dastergon.github.io/availability-calculator/", "description": ""}, {"name": "Standardize cloud SLA availability with numerical performance data", "url": "https://www.ibm.com/developerworks/cloud/library/cl-SLAloadbalance-numanalysis/", "description": ""}, {"name": "Best practices to develop SLAs for cloud computing", "url": "https://www.ibm.com/developerworks/cloud/library/cl-slastandards/", "description": ""}, {"name": "A Practical Guide to SLAs", "url": "https://www.catchpoint.com/blog/sla-management-guide/", "description": ""}, {"name": "Building good SLOs - CRE life lessons", "url": "https://cloudplatform.googleblog.com/2017/10/building-good-SLOs-CRE-life-lessons.html", "description": ""}, {"name": "No Grumpy Humans and Other Site Reliability Engineering Lessons from Google", "url": "https://thenewstack.io/sre-lessons-google-no-grumpy-humans/", "description": ""}, {"name": "Consequences of SLO violations \u2014 CRE life lessons", "url": "https://cloudplatform.googleblog.com/2018/01/consequences-of-SLO-violations-CRE-life-lessons.html", "description": ""}, {"name": "Service Level Objectives in Practice", "url": "https://medium.com/@jerub/service-level-objectives-in-practice-ed1200502d5", "description": ""}, {"name": "SRE Consensus Building", "url": "https://medium.com/@jerub/sre-consensus-building-36ad5d2e470b", "description": ""}, {"name": "An example escalation policy \u2014 CRE 
life lessons", "url": "https://cloudplatform.googleblog.com/2018/01/an-example-escalation-policy-CRE-life-lessons.html", "description": ""}, {"name": "Error Budget Calculator", "url": "https://dastergon.gr/error-budget-calculator/", "description": ""}, {"name": "Understanding error budget overspend - part one - CRE life lessons", "url": "https://cloudplatform.googleblog.com/2018/06/understanding-error-budget-overspend-cre-life-lessons.html", "description": ""}, {"name": "Good housekeeping for error budgets - part two - CRE life lessons", "url": "https://cloudplatform.googleblog.com/2018/06/cre-life-lessons-good-housekeeping-for-error-budgets.html", "description": ""}, {"name": "SRE fundamentals: SLIs, SLAs and SLOs", "url": "https://cloudplatform.googleblog.com/2018/07/sre-fundamentals-slis-slas-and-slos.html", "description": ""}, {"name": "SLOs & You: A Guide To Service Level Objectives", "url": "https://www.circonus.com/2018/07/a-guide-to-service-level-objectives/", "description": ""}, {"name": "Earning Our Wings: Stories and Findings From Operating a Large-scale Concourse Deployment", "url": "https://medium.com/concourse-ci/earning-our-wings-a0c307fa73e6", "description": ""}, {"name": "Nines are Not Enough: Meaningful Metrics for Clouds", "url": "https://ai.google/research/pubs/pub48033", "description": ""}, {"name": "How many nines is my storage system?", "url": "https://medium.com/@jamesacowling/how-many-nines-is-my-storage-system-7d16e852d56d", "description": ""}, {"name": "Don't follow the sun.", "url": "https://lethain.com/dont-follow-the-sun/", "description": ""}, {"name": "The Tyranny of the SLA", "url": "https://www.youtube.com/watch?v=4cPqLuIXBnw", "description": ""}, {"name": "Backblaze Durability is 99.999999999% \u2014 And Why It Doesn\u2019t Matter", "url": "https://www.backblaze.com/blog/cloud-storage-durability/", "description": ""}, {"name": "DevOpsDays Chicago 2019 - The Art of SLOs", "url": "https://youtu.be/Dfnbw5dJQ5I", "description": ""}, 
{"name": "The Art of SLOs Workshop Materials", "url": "https://cre.page.link/art-of-slos", "description": ""}, {"name": "How to Include Latency in SLO-Based Alerting", "url": "https://grafana.com/blog/2019/11/27/kubecon-recap-how-to-include-latency-in-slo-based-alerting/", "description": ""}, {"name": "Succeeding With Service Level Objectives", "url": "https://www.squadcast.com/blog/succeeding-with-service-level-objectives", "description": ""}, {"name": "Putting customers first with SLIs and SLOs", "url": "https://medium.com/the-telegraph-engineering/putting-customers-first-with-slis-and-slos-15352f9b6cbc", "description": ""}, {"name": "SRE Leadership: Have Tiered SLAs", "url": "https://medium.com/site-reliability-engineering-leadership/sre-tip-have-tiered-slas-2c432ffe46a", "description": ""}, {"name": "How SLOs Enable Fast, Reliable Application Delivery", "url": "https://www.blameless.com/blog/how-slos-enable-fast-reliable-application-delivery", "description": ""}, {"name": "The Tail at Scale", "url": "https://billduncan.org/the-tail-at-scale/", "description": ""}, {"name": "The Tail at Scale Revisited", "url": "https://billduncan.org/the-tail-at-scale-revisited/", "description": ""}, {"name": "Defining SLOs for services with dependencies", "url": "https://cloud.google.com/blog/products/gcp/defining-slos-for-services-with-dependencies-cre-life-lessons", "description": ""}, {"name": "Service Level Disagreements", "url": "https://blog.b3k.us/2009/07/15/service-level-disagreements.html", "description": ""}, {"name": "How We Use Sloth to do SLO Monitoring and Alerting with Prometheus", "url": "https://mattermost.com/blog/sloth-for-slo-monitoring-and-alerting-with-prometheus/", "description": ""}, {"name": "SLI Deep Dive", "url": "https://medium.com/site-reliability-engineering-leadership/sli-deep-dive-cae92bd90a79", "description": ""}, {"name": "Measuring Reliability in GCP: Step By Step SLO creation guide using Cloud Operation Sandbox", "url": 
"https://medium.com/google-cloud/measuring-reliability-in-gcp-step-by-step-slo-creation-guide-using-cloud-operation-sandbox-99043bd0e70f", "description": ""}, {"name": "SLO tracker", "url": "https://slotracker.com/", "description": ""}, {"name": "SLO Alerting for Mortals", "url": "https://ervinbarta.com/2021/10/19/slo-alerting-for-mortals/", "description": ""}, {"name": "SRE methods and climate change", "url": "https://bpetit.nce.re/2021/03/sre-methods-and-climate-change/", "description": ""}, {"name": "What made SLOs so messy (and what we can do about it)", "url": "https://medium.com/lightstephq/what-made-slos-so-messy-and-what-we-can-do-about-it-89be415a80b3", "description": ""}, {"name": "SLICK: Adopting SLOs for improved reliability", "url": "https://engineering.fb.com/2021/12/13/production-engineering/slick/", "description": ""}, {"name": "Calculating composite SLA", "url": "https://alexewerlof.medium.com/calculating-composite-sla-d855eaf2c655", "description": ""}, {"name": "Best practices for setting SLOs and SLIs for modern, complex systems", "url": "https://newrelic.com/blog/best-practices/best-practices-for-setting-slos-and-slis-for-modern-complex-systems", "description": ""}, {"name": "Performance Checklists for SREs", "url": "https://www.brendangregg.com/blog/2016-05-04/srecon2016-perf-checklists-for-sres.html", "description": ""}, {"name": "South Bay SRE Meetup - Netflix Cloud Performance Team", "url": "https://youtu.be/uQ0flQOtQEA", "description": ""}, {"name": "Software Performance Analysis Guided By SLOs", "url": "https://medium.com/dm03514-tech-blog/sre-performance-analysis-tuning-methodology-using-a-simple-http-webserver-in-go-d475460f27ca", "description": ""}, {"name": "A framework for pragmatic performance engineering", "url": "https://mterwill.com/posts/framework-for-performance-engineering/", "description": ""}, {"name": "Go Language for Ops and Site Reliability Engineering", "url": "http://www.oreilly.com/pub/e/2712", "description": ""}, 
{"name": "Go for SREs using Python", "url": "https://www.usenix.org/sites/default/files/conference/protected-files/srecon16_slides_hamilton.pdf", "description": ""}, {"name": "Operability in Go", "url": "https://speakerdeck.com/ianschenck/operability-in-go", "description": ""}, {"name": "Go Reliability and Durability at Dropbox", "url": "https://www.youtube.com/watch?v=5doOcaMXx08", "description": ""}, {"name": "What is SRE (Site Reliability Engineering)?", "url": "https://www.oreilly.com/ideas/what-is-sre-site-reliability-engineering", "description": ""}, {"name": "Here\u2019s How Google Makes Sure It (Almost) Never Goes Down", "url": "http://www.wired.com/2016/04/google-ensures-services-almost-never-go/", "description": ""}, {"name": "Are site reliability engineers the next data scientists?", "url": "http://techcrunch.com/2016/03/02/are-site-reliability-engineers-the-next-data-scientists/", "description": ""}, {"name": "Site Reliability Engineers: \"solving the most interesting problems\"", "url": "http://googleresearch.blogspot.gr/2012/07/site-reliability-engineers-solving-most.html", "description": ""}, {"name": "Site Reliability Engineers: the \"world\u2019s most intense pit crew\"", "url": "http://googleforstudents.blogspot.gr/2012/06/site-reliability-engineers-worlds-most.html", "description": ""}, {"name": "Site reliability engineering kicks rote tasks out of IT ops", "url": "http://searchitoperations.techtarget.com/feature/Site-reliability-engineering-kicks-rote-tasks-out-of-IT-ops", "description": ""}, {"name": "Notes on Site Reliability Engineering", "url": "http://danluu.com/google-sre-book/", "description": ""}, {"name": "Adventures in SRE-land: Welcome to Google Mission Control", "url": "https://cloudplatform.googleblog.com/2016/07/adventures-in-SRE-land-welcome-to-Google-Mission-Control.html", "description": ""}, {"name": "Book Review: Site Reliability Engineering - How Google Runs Production Systems", "url": 
"https://www.infoq.com/articles/site-reliability-engineering", "description": ""}, {"name": "Site Reliability Engineers: \u201cWe solve cooler problems\u201d", "url": "https://www.google.com/about/careers/stories/site-reliability-engineering-profile-google/", "description": ""}, {"name": "SREcon17: Brave new world of site reliability engineering", "url": "http://www.networkworld.com/article/3182827/cloud-computing/srecon17-brave-new-world-of-site-reliability-engineering.html", "description": ""}, {"name": "Open AWS guide", "url": "https://github.com/open-guides/og-aws", "description": "", "stars": "33k"}, {"name": "Commentary on Site Reliability Engineering", "url": "https://medium.com/@jerub/commentary-on-site-reliability-engineering-9ba9e1be2a8c", "description": ""}, {"name": "Site Reliability Engineering: 4 Things to Know", "url": "https://www.networkcomputing.com/data-centers/site-reliability-engineering-4-things-know/888724300", "description": ""}, {"name": "Looking for SRE Success? Then Find the Intrapreneurs!", "url": "https://www.linkedin.com/pulse/looking-sre-success-find-intrapreneurs-josh-gilliland/", "description": ""}, {"name": "What Team Structure is Right for DevOps to Flourish?", "url": "http://web.devopstopologies.com/", "description": ""}, {"name": "Injured on Vacation? 
Applying Principles from Site Reliability Engineering to a Travel Emergency", "url": "https://www.sidewalksafari.com/2018/12/sre-in-a-travel-emergency.html", "description": ""}, {"name": "Building blameless working environment", "url": "https://sobolevn.me/2018/12/blameless-environment", "description": ""}, {"name": "SRE Adoption Report", "url": "https://techbeacon.com/devops/how-accenture-retrofitted-site-reliability-engineering", "description": ""}, {"name": "SREs: The Happiest \u2013 and Highest Paid \u2013 in the Industry", "url": "https://devops.com/sres-the-happiest-and-highest-paid-in-the-industry/", "description": ""}, {"name": "The Role of Site Reliability Engineering, Today and Tomorrow", "url": "https://thenewstack.io/the-role-of-site-reliability-engineering-today-and-tomorrow/", "description": ""}, {"name": "SRE as a Lifestyle Choice", "url": "https://medium.com/@bellmar/sre-as-a-lifestyle-choice-de9f5a82d73d", "description": ""}, {"name": "SRECon EMEA 2019 Recap", "url": "https://speakerdeck.com/dastergon/srecon-emea-2019-recap-sre-muc-meetup", "description": ""}, {"name": "Life of an SRE at Google - JC van Winkel", "url": "https://www.youtube.com/watch?v=7Oe8mYPBZmw", "description": ""}, {"name": "Site Reliability Engineering for Native Mobile Apps - Abhijith Krishnappa", "url": "https://www.infoq.com/articles/site-reliability-engineering-mobile-apps/", "description": "Case study: Halodoc adaptation of SRE principles for Native Mobile Apps"}, {"name": "SRE Best Practices by InfraCloud", "url": "https://www.infracloud.io/blogs/sre-best-practices/", "description": ""}, {"name": "#sre channel at Hangops Slack", "url": "https://hangops.slack.com/", "description": "Discussion of Site Reliability Engineering generally."}, {"name": "#incident_response channel at Hangops Slack", "url": "https://hangops.slack.com/", "description": "Discussion about Incident Response."}, {"name": "USENIX SREcon Slack", "url": "https://usenix-srecon.slack.com", "description": 
""}, {"name": "Brendan Gregg's Blog", "url": "http://www.brendangregg.com/blog/index.html", "description": "Highly Technical Blog Posts About Systems Internals, Performance and SRE."}, {"name": "Everything Sysadmin", "url": "http://everythingsysadmin.com/", "description": "Blog Posts About SysAdmin/DevOps/SRE by Tom Limoncelli."}, {"name": "High Scalability", "url": "http://highscalability.com/", "description": "Technical Blog Posts About Systems Architecture."}, {"name": "rachelbythebay", "url": "https://rachelbythebay.com/w/", "description": "Technical Blog Posts."}, {"name": "Susan J. Fowler", "url": "http://www.susanjfowler.com/blog/", "description": "Various blog posts about SRE, Software Engineering and Microservices."}, {"name": "SysAdvent", "url": "https://sysadvent.blogspot.com", "description": "One article for each day of December, ending on the 25th article."}, {"name": "Stephen Thorne's Blog", "url": "https://medium.com/@jerub", "description": "Blog Posts About SRE"}, {"name": "Increment", "url": "https://increment.com/", "description": "A digital magazine about how teams build and operate software systems at scale."}, {"name": "GopherSRE", "url": "http://www.gophersre.com/", "description": "Blog Posts about Go and SRE."}, {"name": "Cindy Sridharan", "url": "https://medium.com/@copyconstruct", "description": "Blog posts about distributed systems and their management."}, {"name": "Blameless Blog", "url": "https://www.blameless.com/blog/", "description": "Blog posts about SRE culture and practices."}, {"name": "Resilience Roundup", "url": "https://ResilienceRoundup.com", "description": "Weekly analysis of Resilience Engineering and Human Factors research designed for software systems"}, {"name": "Squadcast Blog", "url": "https://www.squadcast.com/blog", "description": "Blog posts about SRE best practices, reliability, on-call and incident management."}, {"name": "FireHydrant Blog", "url": "https://www.firehydrant.io/blog", "description": "Posts about 
complex systems, incident response, and SRE best practices."}, {"name": "Rootly Blog", "url": "https://www.rootly.io/blog", "description": "Incident management best practices and guides."}, {"name": "incident.io Blog", "url": "https://www.incident.io/blog", "description": "Guides, advice and resources on incident management and response."}, {"name": "Logit.io Blog", "url": "https://logit.io/blog", "description": "Resources on log management, SRE and DevOps."}, {"name": "DevOpsLinks", "url": "https://faun.dev", "description": "A weekly newsletter about SRE, SysAdmin and DevOps news, tools, tutorials and opinions."}, {"name": "KubeWeekly", "url": "https://kubeweekly.io/", "description": "The weekly newsletter for all things Kubernetes. KubeWeekly is curated by Bob Killen, Chris Short, Craig Box, Kim McMahon and Michael Hausenblas."}, {"name": "SRE Weekly", "url": "https://sreweekly.com/", "description": "Weekly Site Reliability Newsletter."}, {"name": "O\u2019Reilly Systems Engineering and Operations Newsletter", "url": "http://www.oreilly.com/webops-perf/newsletter.html", "description": "Weekly systems engineering and operations news and insights from industry insiders."}, {"name": "ChaosEngineering.news", "url": "https://chaosengineering.news/", "description": "Chaos Engineering newsletter. All things Chaos Engineering, directly to your inbox!"}, {"name": "Monitoring Weekly", "url": "https://monitoring.love/", "description": "What's new in monitoring? 
Curated monitoring articles to your inbox each week."}, {"name": "Observability news", "url": "https://o11y.news/", "description": "Updates around observability (o11y) with a special focus on open source."}, {"name": "SRECon Conferences", "url": "https://www.usenix.org/conferences/byname/925", "description": "The Official SRE Conference."}, {"name": "LISA Conferences", "url": "https://www.usenix.org/conferences/byname/5", "description": "Prominent Conference About SysAdmin/DevOps/SRE."}, {"name": "SRE Tech Talks", "url": "https://developers.google.com/events/sre/", "description": "SRE Talks Hosted by Google."}, {"name": "South Bay Site Reliability Engineering (Sunnyvale, CA) Meetup", "url": "https://www.meetup.com/South-Bay-Site-Reliability-Engineering/", "description": "A Group For Individuals Who Tackle Reliability Challenges For Web-Scale Systems."}, {"name": "San Francisco Reliability Engineering", "url": "https://www.meetup.com/San-Francisco-Reliability-Engineering/", "description": "A Group Of People Who Are Passionate About Reliable, Performant Software Systems."}, {"name": "Site Reliability Engineering Munich, Germany", "url": "https://www.meetup.com/Site-Reliability-Engineering-Munich/", "description": "SRE Meetup in the greater area of Oktoberfest city."}, {"name": "ADDO - All Day DevOps", "url": "https://www.alldaydevops.com/", "description": "A 24 hour conference that is completely online and free."}, {"name": "Site Reliability Engineering Paris, France", "url": "https://www.meetup.com/Site-Reliability-Engineering-Paris/", "description": "SRE Meetup in the city of light."}, {"name": "Site Reliability Engineering India", "url": "https://www.meetup.com/site-reliability-enggineering/", "description": "SRE Meetup India"}, {"name": "Google SRE Twitter Account", "url": "https://twitter.com/googlesre", "description": "Google's SRE Twitter Account."}, {"name": "SREBook", "url": "https://twitter.com/SREBook", "description": "The Official Twitter Account of Site 
Reliability Engineering Book."}, {"name": "SREcon", "url": "https://twitter.com/SREcon", "description": "SRECon's Official Twitter Account."}, {"name": "SREWorkbook", "url": "https://twitter.com/SREWorkbook", "description": "The Official Twitter Account of Site Reliability Workbook."}, {"name": "The SRE Dev", "url": "https://twitter.com/The_SRE_Dev", "description": "SRE-related Posts from [dev.to](https://dev.to)."}, {"name": "Twitter SRE", "url": "https://twitter.com/TwitterSRE", "description": "The Official Twitter Account of Twitter's SRE team."}, {"name": "Twitter SRE Weekly", "url": "https://twitter.com/SREWeekly", "description": "The Official Twitter Account of SRE Weekly Newsletter."}, {"name": "USENIX Association", "url": "https://twitter.com/usenix", "description": "The Official USENIX Twitter Account."}, {"name": "Awesome SRE Tools", "url": "https://github.com/SquadcastHub/awesome-sre-tools", "description": "A curated list of Site Reliability and Production Engineering tools", "stars": "617"}, {"name": "List of Continuous Integration services", "url": "https://github.com/ligurio/awesome-ci", "description": "", "stars": "3k"}, {"name": "SRE cheat sheet", "url": "https://github.com/shibumi/SRE-cheat-sheet", "description": "A cheat sheet for Site Reliability Engineering principles and numbers", "stars": "116"}, {"name": "Blameless / Resilience in Action", "url": "https://podcasts.apple.com/us/podcast/resilience-in-action/id1506828506", "description": ""}, {"name": "Google SRE Prodcast", "url": "https://sre.google/prodcast", "description": ""}, {"name": "o11y Observability Podcast", "url": "https://www.honeycomb.io/usecase/o11ycast/", "description": ""}, {"name": "On Call Nightmares (retired)", "url": "https://podcasts.apple.com/us/podcast/on-call-nightmares-podcast/id1447430839", "description": ""}, {"name": "Making of the SRE Omelette", "url": "https://open.spotify.com/show/1KxLVUduNdDRAiOw8BB32J", "description": ""}]}], "name": ""} |