| 1 | """ |
| 2 | oversight_core.decoy |
| 3 | =================== |
| 4 | |
| 5 | LLM-powered decoy document generator. |
| 6 | |
| 7 | Generates N plausible-looking decoy files that sit alongside real sensitive |
| 8 | content. Every decoy is sealed for a "trap" recipient whose beacons all fire |
| 9 | when accessed. Any open of a decoy is a high-confidence signal of intrusion - |
| 10 | no legitimate user should touch them, because the decoys are filenames |
| 11 | engineered to be interesting to an attacker browsing. |
| 12 | |
| 13 | This is the Thinkst canary pattern applied at scale with LLM-generated |
| 14 | realism. Recent research (SPADE 2025, HoneyGPT) shows this is an open area |
| 15 | with no strong commercial shipment. |
| 16 | |
| 17 | Backend options (pick via `backend` arg or OVERSIGHT_DECOY_BACKEND env): |
| 18 | - "ollama" - POST to a local Ollama server (recommended) |
| 19 | - "openai" - OpenAI-compatible API (for testing; not implemented in this module - currently falls back to "static") |
| 20 | - "static" - hardcoded templates (works offline; lowest quality) |
| 21 | |
| 22 | Override the Ollama endpoint and model with the ``OLLAMA_URL`` and |
| 23 | ``OVERSIGHT_DECOY_MODEL`` environment variables. Defaults target a |
| 24 | loopback Ollama install. |
| 25 | """ |
| 26 | |
| 27 | from __future__ import annotations |
| 28 | |
| 29 | import json |
| 30 | import os |
| 31 | import random |
| 32 | from dataclasses import dataclass |
| 33 | from typing import Optional |
| 34 | |
| 35 | import httpx |
| 36 | |
| 37 | |
# Default Ollama endpoint and model name. Both are read from the environment
# once, when this module is imported, so changing OLLAMA_URL or
# OVERSIGHT_DECOY_MODEL afterwards does not alter these values.
DEFAULT_OLLAMA = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434")
DEFAULT_MODEL = os.environ.get("OVERSIGHT_DECOY_MODEL", "llama3.1:8b")
| 40 | |
| 41 | |
# Pool of decoy filenames. generate_decoy_set() draws a random sample from this
# list when the caller does not supply its own filenames.
DEFAULT_DECOY_NAMES = [
    "Q4-board-deck-FINAL-v3.docx",
    "acquisition-targets-2026.xlsx",
    "legal-hold-privileged.pdf",
    "compensation-bands-confidential.xlsx",
    "incident-response-playbook-internal.docx",
    "vendor-contracts-summary.pdf",
    "cto-1on1-notes.docx",
    "layoff-planning-tier1.xlsx",
    "customer-churn-risk-2026.xlsx",
    "M&A-pipeline-confidential.pptx",
    "security-audit-findings-Q3.pdf",
    "api-keys-rotation-plan.txt",
    "lawsuit-draft-settlement.docx",
    "executive-bonus-structure.xlsx",
    "strategic-partnership-nda-drafts.pdf",
]
| 59 | |
| 60 | |
# System prompt passed as the "system" field of every Ollama generation
# request. It constrains the model to fictional names, figures and companies.
DECOY_SYSTEM_PROMPT = """You are a corporate document generator for a security
research system. You produce plausible-looking but entirely fictional business
documents that will be used as decoys in an intrusion-detection system. All
names, numbers, and claims must be invented - never use real company names,
real people, or real data. The goal is realism of form, not content.

Rules:
- All dollar figures are fake.
- All people are fictional (use generic names like "A. Smith", "J. Chen").
- All company names are fake (use "Acme Industries", "Meridian Partners").
- Avoid dates in the near past (the document should look "current" as of 2026).
- Tone: dry, corporate, slightly bureaucratic. No irony.
- Length: 250-600 words for text documents.
"""
| 75 | |
| 76 | |
@dataclass
class DecoyRequest:
    """A request to generate one decoy.

    Bundles the target filename, a topic hint for the generator, and an
    optional free-text organizational context.
    """
    # Filename the decoy is meant to be saved as; quoted in the model prompt
    # and in the header line of the static template.
    filename: str
    # Short description of the document's subject matter.
    topic_hint: str
    # Optional organizational context; left out of the prompt when falsy.
    context: Optional[str] = None
| 83 | |
| 84 | |
| 85 | def _prompt_for(req: DecoyRequest) -> str: |
| 86 | ctx = f"\nOrganizational context: {req.context}" if req.context else "" |
| 87 | return ( |
| 88 | f"Produce a realistic but entirely fictional document that would " |
| 89 | f"plausibly be saved as the filename '{req.filename}'. The topic is: " |
| 90 | f"{req.topic_hint}.{ctx}\n\n" |
| 91 | f"Write the full document body. No preamble, no meta-commentary. " |
| 92 | f"Begin the document directly." |
| 93 | ) |
| 94 | |
| 95 | |
| 96 | def _topic_from_filename(name: str) -> str: |
| 97 | """Heuristic: guess topic from filename when not otherwise specified.""" |
| 98 | n = name.lower() |
| 99 | if "board" in n or "deck" in n: |
| 100 | return "quarterly board meeting update" |
| 101 | if "acquisition" in n or "m&a" in n or "pipeline" in n: |
| 102 | return "shortlist of acquisition targets with preliminary valuations" |
| 103 | if "legal" in n or "lawsuit" in n: |
| 104 | return "legal memo with privileged work-product notation" |
| 105 | if "comp" in n or "bonus" in n or "bands" in n: |
| 106 | return "executive compensation band summary" |
| 107 | if "incident" in n or "playbook" in n: |
| 108 | return "internal incident response playbook" |
| 109 | if "audit" in n or "findings" in n: |
| 110 | return "internal security audit findings summary" |
| 111 | if "api" in n or "key" in n: |
| 112 | return "API key rotation plan with endpoint references" |
| 113 | if "layoff" in n: |
| 114 | return "workforce reduction planning notes" |
| 115 | if "churn" in n: |
| 116 | return "customer churn risk analysis" |
| 117 | if "partnership" in n or "nda" in n: |
| 118 | return "strategic partnership NDA draft negotiation notes" |
| 119 | if "1on1" in n or "notes" in n: |
| 120 | return "executive one-on-one meeting notes" |
| 121 | if "vendor" in n or "contract" in n: |
| 122 | return "vendor contract summary with renewal dates" |
| 123 | return "internal business memo" |
| 124 | |
| 125 | |
| 126 | |
def _generate_ollama(
    req: DecoyRequest,
    ollama_url: str = DEFAULT_OLLAMA,
    model: str = DEFAULT_MODEL,
    timeout: float = 120.0,
) -> str:
    """Generate a decoy body by calling an Ollama server.

    Sends one non-streaming POST to ``<ollama_url>/api/generate`` with the
    request's prompt and the module's system prompt, then returns the
    ``"response"`` field of the JSON reply.

    Errors are not handled here: an HTTP error status raises via
    ``raise_for_status()``, and a reply without a ``"response"`` key raises
    ``KeyError``.
    """
    payload = {
        "model": model,
        "prompt": _prompt_for(req),
        "system": DECOY_SYSTEM_PROMPT,
        "stream": False,
        "options": {"temperature": 0.8, "top_p": 0.9, "num_predict": 800},
    }
    # Strip trailing slashes so a base URL given with or without one yields
    # the same endpoint.
    endpoint = ollama_url.rstrip("/") + "/api/generate"

    response = httpx.post(endpoint, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()["response"]
| 147 | |
| 148 | |
| 149 | def _generate_static(req: DecoyRequest) -> str: |
| 150 | """Offline fallback. Good enough for testing; not production.""" |
| 151 | lines = [ |
| 152 | f"INTERNAL - {req.filename}", |
| 153 | f"Topic: {req.topic_hint}", |
| 154 | "", |
| 155 | "Summary", |
| 156 | "-------", |
| 157 | f"This document covers the {req.topic_hint}. It is distributed to a", |
| 158 | "limited group and should not be shared externally. Figures cited below", |
| 159 | "are preliminary and subject to revision.", |
| 160 | "", |
| 161 | "Key points", |
| 162 | "----------", |
| 163 | "- Reviewed by: A. Smith, J. Chen", |
| 164 | "- Next review: Q3 2026", |
| 165 | "- Distribution: executive leadership only", |
| 166 | "- Classification: CONFIDENTIAL - RESTRICTED", |
| 167 | "", |
| 168 | "Background", |
| 169 | "----------", |
| 170 | ] |
| 171 | for i in range(30): |
| 172 | lines.append( |
| 173 | f"Paragraph {i+1}: standard corporate filler content for the " |
| 174 | f"{req.topic_hint} topic, written to give plausible body to a " |
| 175 | f"decoy document." |
| 176 | ) |
| 177 | return "\n".join(lines) |
| 178 | |
| 179 | |
def generate_decoy(
    req: DecoyRequest,
    backend: Optional[str] = None,
    ollama_url: str = DEFAULT_OLLAMA,
    model: str = DEFAULT_MODEL,
) -> str:
    """Generate a single decoy document body and return its text.

    Args:
        req: The decoy to generate (filename, topic hint, optional context).
        backend: ``"ollama"`` or ``"static"``. When falsy, the
            ``OVERSIGHT_DECOY_BACKEND`` environment variable is consulted,
            defaulting to ``"ollama"``.
        ollama_url: Base URL of the Ollama server (used by ``"ollama"`` only).
        model: Model name passed to Ollama (used by ``"ollama"`` only).

    Returns:
        The document text. Generation is best-effort: if the Ollama backend
        fails for any reason, or the backend name is not one this function
        implements, the static template is returned instead and a notice is
        printed.
    """
    chosen = backend or os.environ.get("OVERSIGHT_DECOY_BACKEND", "ollama")

    if chosen == "ollama":
        try:
            return _generate_ollama(req, ollama_url=ollama_url, model=model)
        except Exception as e:
            # Deliberately broad: any network, HTTP or payload error must
            # degrade to the offline template rather than abort the caller.
            print(f"[decoy] backend '{chosen}' failed ({e}); falling back to static")
    elif chosen != "static":
        # Previously an unknown backend name degraded silently, which hid
        # typos and unimplemented backends from the operator.
        print(f"[decoy] backend '{chosen}' is not supported; falling back to static")

    return _generate_static(req)
| 196 | |
| 197 | |
def generate_decoy_set(
    n: int = 5,
    filenames: Optional[list[str]] = None,
    context: Optional[str] = None,
    backend: Optional[str] = None,
) -> list[tuple[str, str]]:
    """Generate up to ``n`` decoys and return them as (filename, body) tuples.

    Args:
        n: Maximum number of decoys to produce. Must be non-negative.
        filenames: Names to use, taken in order and truncated to ``n``. When
            omitted or empty, names are sampled at random (without
            repetition) from ``DEFAULT_DECOY_NAMES``.
        context: Optional organizational context forwarded to every request.
        backend: Backend name forwarded to ``generate_decoy``.

    Returns:
        A list of ``(filename, body)`` tuples. It holds fewer than ``n``
        entries when fewer names are available than requested.

    Raises:
        ValueError: If ``n`` is negative.
    """
    # Without this guard a negative n combined with explicit filenames would
    # reach the names[:n] slice below and quietly drop names from the end.
    # The default path already raised ValueError via random.sample.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    names = filenames or random.sample(
        DEFAULT_DECOY_NAMES, min(n, len(DEFAULT_DECOY_NAMES))
    )

    out = []
    for name in names[:n]:
        req = DecoyRequest(
            filename=name,
            topic_hint=_topic_from_filename(name),
            context=context,
        )
        out.append((name, generate_decoy(req, backend=backend)))
    return out