[{"data":1,"prerenderedAt":297},["ShallowReactive",2],{"doc-\u002Fsecurity\u002Fincident-response":3},{"id":4,"title":5,"body":6,"description":287,"edit":288,"extension":289,"meta":290,"navigation":291,"path":292,"seo":293,"stem":294,"vertical":288,"weight":295,"__hash__":296},"content\u002Fsecurity\u002Fincident-response.md","Incident response",{"type":7,"value":8,"toc":271},"minimark",[9,13,18,21,26,39,42,46,57,60,64,75,79,146,149,154,157,160,164,198,202,213,217,237,241,244,264,268],[10,11,12],"p",{},"This page documents what OpenSense does when an incident occurs, and\nwhat we ask of customers in return. It is short on purpose.",[14,15,17],"h2",{"id":16},"what-counts-as-an-incident","What counts as an incident",[10,19,20],{},"Three categories, with different response classes.",[22,23,25],"h3",{"id":24},"p0-customer-data-loss-or-exposure","P0 — customer data loss or exposure",[27,28,29,33,36],"ul",{},[30,31,32],"li",{},"Customer data is exposed to a party that should not have it.",[30,34,35],{},"Customer data is irrecoverably lost.",[30,37,38],{},"The audit trail's integrity is compromised in a way that cannot\nbe silently corrected.",[10,40,41],{},"Examples: a database leak, a backup that turned out unrecoverable\nduring a real restore, an audit-log row found to have been silently\nedited.",[22,43,45],{"id":44},"p1-ingest-or-alarm-path-down","P1 — ingest or alarm path down",[27,47,48,51,54],{},[30,49,50],{},"Ingest endpoint returning 5xx for > 5 minutes.",[30,52,53],{},"Alarm dispatch paths (Telegram, email) failing for > 5 minutes.",[30,55,56],{},"Dashboard down for > 15 minutes.",[10,58,59],{},"Examples: Postgres unreachable, Postmark account suspended,\nHetzner DC network partition.",[22,61,63],{"id":62},"p2-degraded-but-functional","P2 — degraded but functional",[27,65,66,69,72],{},[30,67,68],{},"Slow ingest (p99 latency > 2 s).",[30,70,71],{},"Slow report rendering (> 2 min for a monthly).",[30,73,74],{},"Single device's ingest token mis-issued; one 
customer affected.",[14,76,78],{"id":77},"response-timelines","Response timelines",[80,81,82,101],"table",{},[83,84,85],"thead",{},[86,87,88,92,95,98],"tr",{},[89,90,91],"th",{},"Class",[89,93,94],{},"Acknowledgement",[89,96,97],{},"Status update cadence",[89,99,100],{},"Resolution target",[102,103,104,119,132],"tbody",{},[86,105,106,110,113,116],{},[107,108,109],"td",{},"P0",[107,111,112],{},"30 minutes",[107,114,115],{},"Hourly",[107,117,118],{},"Best effort, with full disclosure post-resolution",[86,120,121,124,126,129],{},[107,122,123],{},"P1",[107,125,112],{},[107,127,128],{},"Every 60 minutes",[107,130,131],{},"8 business hours",[86,133,134,137,140,143],{},[107,135,136],{},"P2",[107,138,139],{},"4 business hours",[107,141,142],{},"Once per business day",[107,144,145],{},"5 business days",[10,147,148],{},"Acknowledgement: we publicly confirm we know about it through:",[27,150,151],{},[30,152,153],{},"The status page, plus email to affected customers.",[10,155,156],{},"Status update cadence: even when there is \"no new news\", we post\n\"still investigating\" at the cadence above.",[10,158,159],{},"Resolution: the incident is closed and a post-mortem is published.",[14,161,163],{"id":162},"communication-channels","Communication channels",[27,165,166,178,188],{},[30,167,168,172,173,177],{},[169,170,171],"strong",{},"Status page"," (",[174,175,176],"code",{},"opensense.murzin.digital\u002Fstatus",") — the\none-second health check; we post incident updates here first.",[30,179,180,183,184,187],{},[169,181,182],{},"Affected-customer email"," — direct to operators on file, from\n",[174,185,186],{},"alerts@",". Sent from infrastructure independent of the SaaS, so\nit survives outages of the main service.",[30,189,190,193,194,197],{},[169,191,192],{},"Public post-mortem"," — within 7 business days of resolution,\nfor any P0 or any P1 lasting > 1 h. 
Posted at\n",[174,195,196],{},"\u002Fblog\u002Fpostmortem\u002F\u003Cdate-slug>"," on the main site.",[14,199,201],{"id":200},"what-we-ask-of-customers","What we ask of customers",[27,203,204,207,210],{},[30,205,206],{},"Keep an up-to-date email on file. We will not phone you; we will\nemail.",[30,208,209],{},"Optionally, opt in to Telegram for incident notices. The\nTelegram path is faster than email for the operator's situational\nawareness.",[30,211,212],{},"Be patient at the start. The first 30 minutes are mostly\ndiagnosis. We will not invent a story to fill the silence.",[14,214,216],{"id":215},"what-we-will-not-do-during-an-incident","What we will not do during an incident",[27,218,219,225,231],{},[30,220,221,224],{},[169,222,223],{},"Delete or edit historical data to fix the problem."," Audit-log\nintegrity is a hard rule.",[30,226,227,230],{},[169,228,229],{},"Blame customers."," If our ingest rejected a payload because the\ncustomer's payload was wrong, we will say \"the ingest rejected\nthis payload because X\" — that is a fact, not blame. We will not\nfabricate fault.",[30,232,233,236],{},[169,234,235],{},"Mark \"resolved\" before it is."," We would rather have a long\nopen incident than a closed one that recurs.",[14,238,240],{"id":239},"pre-incident-hygiene","Pre-incident hygiene",[10,242,243],{},"We do the boring things because they pay off in the incident:",[27,245,246,255,258,261],{},[30,247,248,249,254],{},"Postgres dumps hourly. 
Tested restore monthly (target — see\n",[250,251,253],"a",{"href":252},"\u002Fsecurity\u002Farchitecture","architecture","; we acknowledge the gap).",[30,256,257],{},"All deployments roll back with a single command.",[30,259,260],{},"Schema migrations are reviewed by a second pair of eyes (today,\nthe founder reviews their own work, which is suboptimal; this is\none of the reasons we are hiring a second engineer in 2027).",[30,262,263],{},"Audit-log heads are published daily; even a P0 cannot silently\nrewrite history without that being externally visible.",[14,265,267],{"id":266},"past-incidents","Past incidents",[10,269,270],{},"(Nothing yet to disclose. This section will accumulate. We will\nnot pretend we never have an incident; we will name them when they\noccur.)",{"title":272,"searchDepth":273,"depth":273,"links":274},"",3,[275,281,282,283,284,285,286],{"id":16,"depth":276,"text":17,"children":277},2,[278,279,280],{"id":24,"depth":273,"text":25},{"id":44,"depth":273,"text":45},{"id":62,"depth":273,"text":63},{"id":77,"depth":276,"text":78},{"id":162,"depth":276,"text":163},{"id":200,"depth":276,"text":201},{"id":215,"depth":276,"text":216},{"id":239,"depth":276,"text":240},{"id":266,"depth":276,"text":267},"What we do when something goes wrong",null,"md",{},true,"\u002Fsecurity\u002Fincident-response",{"title":5,"description":287},"security\u002Fincident-response",550,"4ivLw8VjQwMjJKd31QztYvb1jYajxwiDbCRcogiK3iA",1779022956166]