| @@ -1,5 +1,31 @@ | ||
| # Oversight CHANGELOG | ||
| + | ## v0.4.5 - 2026-04-20 L3 safety, GUI, and registry federation docs | |
| + | ||
| + | Review-driven hardening from `P:/Oversight/oversight-protocol-review.md`. | |
| + | ||
| + | - `oversight_core/l3_policy.py`: new L3 safety policy engine. L3 defaults off | |
| + | for legal, regulatory, technical/spec, source-code, SQL, log, and structured | |
| + | data classes; explicit `full`, `boilerplate`, and `off` modes are supported. | |
| + | - `cli/oversight.py` and `cli/oversight_rich.py`: seal-time L3 disclosure now | |
| + | requires acknowledgement when L3 is enabled, and seal manifests record the | |
| + | applied L3 policy. | |
| + | - `oversight_core/manifest.py`: manifests now carry `canonical_content_hash` | |
| + | so auditors can diff recipient copies against the original source bytes. | |
| + | - `oversight_core/watermark.py` and `oversight_core/formats/text.py`: high-level | |
| + | L3 application is opt-in; L1/L2 remain available by default. | |
| + | - `cli/gui.py`: added a Tkinter desktop GUI for key generation, sealing, and | |
| + | opening files (`oversight gui`) so non-technical users have a starter path. | |
| + | - `docs/security.md`: documented L3 collusion/canonicalization limits, layer | |
| + | survival properties, passive beacon limits, jurisdiction-by-IP limits, and | |
| + | RFC 3161 timestamp semantics. | |
| + | - `docs/spec/registry-v1.md`: added a registry federation/interoperability | |
| + | draft for independent compatible registry operators. | |
| + | - `docs/ROADMAP.md`: corrected launch sequencing, dropped near-term FedRAMP, | |
| + | scoped ecosystem plugins to Outlook-first, and prioritized SIEM integration | |
| + | before SOC 2 / ISO 27001 work. | |
| + | - Added focused regression coverage in `tests/test_l3_policy_unit.py`. | |
| + | ||
| ## v0.4.4 - 2026-04-20 security hardening | ||
| Security patch line started from the `v0.4.3` Python package baseline |
| @@ -4,7 +4,7 @@ | ||
| Co-authored by Zion Boggan and Claude Opus 4.6/4.7 (Anthropic) and Codex ChatGPT-5-4 (OpenAI). | ||
| - | Format-agnostic. Post-quantum ready (ML-KEM-768 + ML-DSA-65). Three-layer watermarking that survives format conversion, invisible-char stripping, and screenshot/OCR. Content fingerprinting that identifies leaked copies even when all watermarks are destroyed. | |
| + | Format-agnostic. Post-quantum ready (ML-KEM-768 + ML-DSA-65). Layered watermarking with honest limits: L1/L2 are lightweight steganographic signals, L3 is opt-in semantic marking for prose, and content fingerprinting helps identify leaked copies even when fragile marks are destroyed. | |
| No cloud vendor lock-in. No paid service required. No custom cryptography. Apache 2.0. | ||
| @@ -71,11 +71,13 @@ oversight attribute --leak leaked.txt --fingerprints .oversight/fingerprints | ||
| ### What happens when you seal | ||
| - | The seal command applies three watermark layers to the document, each targeting a different attack surface: | |
| + | The seal command applies watermark layers to the document, each targeting a different attack surface: | |
| - **L1** inserts zero-width Unicode characters (survives copy-paste) | ||
| - **L2** encodes bits in trailing whitespace patterns (survives most editors) | ||
| - | - **L3** rotates synonyms from a 151-class dictionary, adjusts punctuation style, spelling variants, and contractions (survives format conversion, invisible-char stripping, and screenshot/OCR) | |
| + | - **L3** optionally rotates prose choices from a 151-class dictionary (survives format conversion and screenshot/OCR, but changes visible text and can be defeated by motivated collusion/canonicalization) | |
| + | ||
| + | L3 defaults off for legal documents, regulatory filings, technical specifications, source code, SQL, logs, and structured data. When L3 is enabled, Oversight asks for explicit acknowledgement before it changes any text. Every seal, with or without L3, records `canonical_content_hash` (the hash of the source bytes before watermarking) in the signed manifest so disputes can compare the recipient copy against the canonical source. | |
| Then it encrypts to the recipient's X25519 public key, timestamps via RFC 3161, logs to the Merkle tree, and writes the `.sealed` file plus a `.fingerprint.json` sidecar for the content fingerprint database. | ||
| @@ -93,6 +95,20 @@ The attribute command runs a 5-phase pipeline: | ||
| 4. **Multi-layer Bayesian fusion** combining all evidence into ranked candidates | ||
| 5. **Content fingerprint comparison** (winnowing + sentence hashing) as a last resort when all watermarks are stripped | ||
| + | ## What's new in v0.4.5 | |
| + | ||
| + | **L3 safety and usability.** Semantic watermarking is now format-aware and | |
| + | opt-in for sensitive classes, with full/boilerplate/off modes, disclosure | |
| + | acknowledgement, canonical source hashing, protected-region skips, and explicit | |
| + | collusion/threat-model documentation in `docs/security.md`. | |
| + | ||
| + | **GUI starter.** `oversight gui` launches a small desktop app for key | |
| + | generation, sealing, and opening files so non-technical recipients are not | |
| + | forced through the CLI. | |
| + | ||
| + | **Registry federation draft.** `docs/spec/registry-v1.md` documents the | |
| + | interoperability contract for compatible registry operators. | |
| + | ||
| ## What's new in v0.4.4 | ||
| **Security hardening over v0.4.3.** This line starts from the v0.4.3 Python | ||
| @@ -121,7 +137,7 @@ See `CHANGELOG.md` for full version history. | ||
| ## Security hardening | ||
| - | These items are included in v0.4.4 and current `main`: | |
| + | These items are included in v0.4.4/v0.4.5 and current `main`: | |
| - `max_opens` now counts only successful recipient decryptions, not failed key guesses. | ||
| - `LOCAL_ONLY` open counters now work on Windows as well as POSIX hosts. | ||
| @@ -135,6 +151,8 @@ These items are included in v0.4.4 and current `main`: | ||
| authenticated DNS beacon callbacks, no silent signed-artifact drops, | ||
| digest-checked Rekor offline verification, fail-closed Rust `max_opens`, | ||
| DOCX keyword insertion, and PDF action screening. | ||
| + | - L3 semantic watermarking is opt-in for sensitive classes, requires | |
| + | disclosure acknowledgement when enabled, and records `canonical_content_hash`. | |
| ## Repository layout | ||
| @@ -0,0 +1,199 @@ | ||
| + | """Small Tkinter GUI for non-technical Oversight users.""" | |
| + | ||
| + | from __future__ import annotations | |
| + | ||
| + | import json | |
| + | from pathlib import Path | |
| + | import tkinter as tk | |
| + | from tkinter import filedialog, messagebox, ttk | |
| + | ||
| + | from oversight_core import ( | |
| + | ClassicIdentity, | |
| + | Manifest, | |
| + | Recipient, | |
| + | WatermarkRef, | |
| + | beacon, | |
| + | content_hash, | |
| + | l3_policy, | |
| + | open_sealed, | |
| + | seal, | |
| + | watermark, | |
| + | ) | |
| + | from oversight_core.fingerprint import ContentFingerprint | |
| + | ||
| + | ||
class OversightGui(tk.Tk):
    """Three-tab Tkinter window for generating keys, sealing, and opening files.

    Each tab collects paths and options in ``tk.StringVar`` fields and runs the
    matching ``oversight_core`` operation when its button is pressed. Every
    action handler catches its own exceptions and reports them in an error
    dialog, so a failure never propagates out of a Tk callback.
    """

    def __init__(self) -> None:
        """Create the main window and build all tabs."""
        super().__init__()
        self.title("Oversight Protocol")
        self.geometry("760x540")
        self._build()

    def _build(self) -> None:
        """Create the notebook and add the keygen, seal, and open tabs."""
        notebook = ttk.Notebook(self)
        notebook.pack(fill="both", expand=True, padx=12, pady=12)
        self._build_keygen(notebook)
        self._build_seal(notebook)
        self._build_open(notebook)

    @staticmethod
    def _require_path(value: str, label: str) -> Path:
        """Return ``value`` as a ``Path``, rejecting blank form fields.

        ``Path("")`` silently becomes the current directory, which turns a
        forgotten field into a confusing "Is a directory" error. Raising here
        lets the dialog name the field that is missing instead.

        Raises:
            ValueError: if ``value`` is empty or only whitespace.
        """
        if not value.strip():
            raise ValueError(f"{label} is required.")
        return Path(value)

    def _row(self, parent, label: str, row: int, browse: bool = False) -> tk.StringVar:
        """Add a label + entry (and optional Browse button) on grid row ``row``.

        Returns the ``StringVar`` bound to the entry so callers can read or
        preset the field's value.
        """
        ttk.Label(parent, text=label).grid(row=row, column=0, sticky="w", pady=4)
        var = tk.StringVar()
        ent = ttk.Entry(parent, textvariable=var, width=72)
        ent.grid(row=row, column=1, sticky="ew", pady=4)
        if browse:
            ttk.Button(parent, text="Browse", command=lambda: self._browse(var)).grid(row=row, column=2, padx=4)
        # Let the entry column absorb horizontal resizing.
        parent.columnconfigure(1, weight=1)
        return var

    def _browse(self, var: tk.StringVar, save: bool = False) -> None:
        """Open a file dialog (save or open) and store the chosen path in ``var``.

        A cancelled dialog returns an empty value and leaves ``var`` untouched.
        """
        path = filedialog.asksaveasfilename() if save else filedialog.askopenfilename()
        if path:
            var.set(path)

    def _build_keygen(self, notebook) -> None:
        """Build the "Generate Keys" tab."""
        frame = ttk.Frame(notebook, padding=12)
        notebook.add(frame, text="Generate Keys")
        identity_id = self._row(frame, "Identity name", 0)
        identity_id.set("alice")
        out = self._row(frame, "Private key output", 1)
        ttk.Button(frame, text="Choose Output", command=lambda: self._browse(out, save=True)).grid(row=1, column=2, padx=4)
        ttk.Button(frame, text="Generate Keypair", command=lambda: self._keygen(identity_id.get(), out.get())).grid(row=2, column=1, sticky="e", pady=12)

    def _build_seal(self, notebook) -> None:
        """Build the "Seal File" tab and keep its form variables on ``self``."""
        frame = ttk.Frame(notebook, padding=12)
        notebook.add(frame, text="Seal File")
        self.seal_input = self._row(frame, "Input file", 0, True)
        self.seal_issuer = self._row(frame, "Issuer private key", 1, True)
        self.seal_recipient = self._row(frame, "Recipient public key", 2, True)
        self.seal_out = self._row(frame, "Sealed output", 3)
        ttk.Button(frame, text="Choose Output", command=lambda: self._browse(self.seal_out, save=True)).grid(row=3, column=2, padx=4)
        self.registry_url = self._row(frame, "Registry URL", 4)
        self.registry_url.set("https://registry.oversightprotocol.dev")
        self.content_type = self._row(frame, "Content type", 5)
        self.content_type.set("text/plain")
        self.l3_mode = tk.StringVar(value="auto")
        ttk.Label(frame, text="L3 mode").grid(row=6, column=0, sticky="w", pady=4)
        ttk.Combobox(frame, textvariable=self.l3_mode, values=["auto", "off", "boilerplate", "full"], state="readonly").grid(row=6, column=1, sticky="w")
        self.watermark_enabled = tk.BooleanVar(value=True)
        ttk.Checkbutton(frame, text="Embed L1/L2 watermarks", variable=self.watermark_enabled).grid(row=7, column=1, sticky="w")
        ttk.Button(frame, text="Seal", command=self._seal_file).grid(row=8, column=1, sticky="e", pady=12)

    def _build_open(self, notebook) -> None:
        """Build the "Open File" tab and keep its form variables on ``self``."""
        frame = ttk.Frame(notebook, padding=12)
        notebook.add(frame, text="Open File")
        self.open_input = self._row(frame, "Sealed file", 0, True)
        self.open_identity = self._row(frame, "Recipient private key", 1, True)
        self.open_out = self._row(frame, "Plaintext output", 2)
        ttk.Button(frame, text="Choose Output", command=lambda: self._browse(self.open_out, save=True)).grid(row=2, column=2, padx=4)
        ttk.Button(frame, text="Open", command=self._open_file).grid(row=3, column=1, sticky="e", pady=12)

    def _keygen(self, identity_id: str, out_path: str) -> None:
        """Generate a classic identity and write private and public JSON files.

        The private file goes to ``out_path``; the public half goes next to it
        with the suffix replaced by ``.pub.json``. A blank ``identity_id``
        falls back to ``"identity"``. Errors are shown in a dialog.
        """
        try:
            path = self._require_path(out_path, "Private key output")
            ident = ClassicIdentity.generate()
            out = {
                "id": identity_id or "identity",
                "x25519_priv": ident.x25519_priv.hex(),
                "x25519_pub": ident.x25519_pub.hex(),
                "ed25519_priv": ident.ed25519_priv.hex(),
                "ed25519_pub": ident.ed25519_pub.hex(),
            }
            # Restrict the private-key file to its owner before any secret is
            # written. Best effort only: some filesystems reject chmod, and a
            # genuine path problem resurfaces in write_text() just below.
            try:
                path.touch(mode=0o600, exist_ok=True)
                path.chmod(0o600)
            except OSError:
                pass
            path.write_text(json.dumps(out, indent=2), encoding="utf-8")
            path.with_suffix(".pub.json").write_text(json.dumps({
                "id": out["id"],
                "x25519_pub": out["x25519_pub"],
                "ed25519_pub": out["ed25519_pub"],
            }, indent=2), encoding="utf-8")
            messagebox.showinfo("Oversight", "Keypair generated.")
        except Exception as exc:
            messagebox.showerror("Oversight", str(exc))

    def _seal_file(self) -> None:
        """Watermark (when possible), seal, and write the selected input file.

        Steps:
          1. Read the input bytes and the issuer / recipient key JSON.
          2. If watermarking is ticked and the input decodes as UTF-8, ask
             ``l3_policy`` whether L3 applies; when it does, the user must
             confirm the disclosure dialog (declining cancels the seal).
             L2 and L1 marks are then embedded.
          3. Build the manifest, recording the hash of the untouched source
             bytes as ``canonical_content_hash`` alongside the L3 decision.
          4. Write the sealed blob and, when any watermark was applied, a
             ``.fingerprint.json`` sidecar next to it.

        Non-UTF-8 input is sealed without watermarks after a warning, rather
        than aborting the seal. Errors are shown in a dialog.
        """
        try:
            input_path = self._require_path(self.seal_input.get(), "Input file")
            issuer_path = self._require_path(self.seal_issuer.get(), "Issuer private key")
            recipient_path = self._require_path(self.seal_recipient.get(), "Recipient public key")

            plaintext = input_path.read_bytes()
            # Kept untouched so the manifest can attest to the pre-watermark bytes.
            canonical_plaintext = plaintext
            issuer = json.loads(issuer_path.read_text(encoding="utf-8"))
            rec_pub = json.loads(recipient_path.read_text(encoding="utf-8"))
            watermarks: list[WatermarkRef] = []
            decision = None

            # Text watermarks only make sense for text payloads; binary input
            # is still sealed, just without marks.
            text = None
            if self.watermark_enabled.get():
                try:
                    text = plaintext.decode("utf-8")
                except UnicodeDecodeError:
                    messagebox.showwarning(
                        "Oversight",
                        "Input is not UTF-8 text; sealing without watermarks.",
                    )

            if text is not None:
                # One mark ID is shared by every layer applied to this copy.
                mark_id = watermark.new_mark_id()
                decision = l3_policy.decide_l3(
                    filename=str(input_path),
                    content_type=self.content_type.get(),
                    text=text,
                    requested_mode=self.l3_mode.get(),
                )
                if decision.enabled:
                    if not messagebox.askyesno(
                        "L3 disclosure",
                        "L3 semantic watermarking changes visible prose. Continue?",
                    ):
                        return
                    text = l3_policy.apply_l3_safe(text, mark_id, mode=decision.mode)
                    watermarks.append(WatermarkRef(f"L3_semantic_{decision.mode}", mark_id.hex()))
                # L3 (if any) runs first, then L2 whitespace, then L1 zero-width.
                text = watermark.embed_ws(text, mark_id)
                text = watermark.embed_zw(text, mark_id)
                plaintext = text.encode("utf-8")
                watermarks.extend([
                    WatermarkRef("L1_zero_width", mark_id.hex()),
                    WatermarkRef("L2_whitespace", mark_id.hex()),
                ])

            recipient = Recipient(rec_pub["id"], rec_pub["x25519_pub"], rec_pub.get("ed25519_pub"))
            manifest = Manifest.new(
                input_path.name,
                content_hash(plaintext),
                len(plaintext),
                issuer.get("id", "issuer"),
                issuer["ed25519_pub"],
                recipient,
                self.registry_url.get(),
                self.content_type.get(),
            )
            manifest.canonical_content_hash = content_hash(canonical_plaintext)
            manifest.watermarks = watermarks
            manifest.l3_policy = decision.to_dict() if decision else {}
            # NOTE(review): the second argument is the literal "pending" rather
            # than a value taken from this manifest - confirm that is what
            # gen_beacons expects.
            manifest.beacons = [
                b.to_dict() for b in beacon.gen_beacons("oversightprotocol.dev", "pending", rec_pub["id"])
            ]
            out_path = Path(self.seal_out.get() or f"{input_path}.sealed")
            blob = seal(plaintext, manifest, bytes.fromhex(issuer["ed25519_priv"]), bytes.fromhex(rec_pub["x25519_pub"]))
            out_path.write_bytes(blob)
            if watermarks:
                fp = ContentFingerprint.from_text(plaintext.decode("utf-8", errors="replace"))
                out_path.with_suffix(".fingerprint.json").write_text(json.dumps({
                    "file_id": manifest.file_id,
                    "recipient_id": rec_pub["id"],
                    "canonical_content_hash": manifest.canonical_content_hash,
                    "l3_policy": manifest.l3_policy,
                    "fingerprint": fp.to_dict(),
                }, indent=2), encoding="utf-8")
            messagebox.showinfo("Oversight", f"Sealed file written.\nfile_id={manifest.file_id}")
        except Exception as exc:
            messagebox.showerror("Oversight", str(exc))

    def _open_file(self) -> None:
        """Decrypt the selected sealed file and write the plaintext.

        All three paths are validated before decryption starts, so a blank
        output field is reported before any work is done. Errors are shown in
        a dialog.
        """
        try:
            sealed_path = self._require_path(self.open_input.get(), "Sealed file")
            identity_path = self._require_path(self.open_identity.get(), "Recipient private key")
            output_path = self._require_path(self.open_out.get(), "Plaintext output")

            ident = json.loads(identity_path.read_text(encoding="utf-8"))
            plaintext, _manifest = open_sealed(
                sealed_path.read_bytes(),
                bytes.fromhex(ident["x25519_priv"]),
            )
            output_path.write_bytes(plaintext)
            messagebox.showinfo("Oversight", "File opened.")
        except Exception as exc:
            messagebox.showerror("Oversight", str(exc))
| + | ||
| + | ||
def main() -> None:
    """Create the Oversight window and run the Tk event loop until it closes."""
    OversightGui().mainloop()


if __name__ == "__main__":
    main()
| @@ -44,6 +44,7 @@ from oversight_core import ( | ||
| open_sealed, | ||
| beacon, | ||
| watermark, | ||
| + | l3_policy, | |
| ) | ||
| from oversight_core.container import SealedFile | ||
| from oversight_core import semantic | ||
| @@ -80,8 +81,11 @@ def cmd_seal(args): | ||
| issuer = json.loads(Path(args.issuer_key).read_text()) | ||
| rec_pub = json.loads(Path(args.recipient_pub).read_text()) | ||
| + | canonical_plaintext = plaintext | |
| + | ||
| # Optional watermarking (text files only) | ||
| watermarks_for_manifest: list[WatermarkRef] = [] | ||
| + | l3_decision = None | |
| if args.watermark: | ||
| try: | ||
| text = plaintext.decode("utf-8") | ||
| @@ -94,11 +98,25 @@ def cmd_seal(args): | ||
| # attribution (one ID per recipient, not one per layer). | ||
| mark_id = watermark.new_mark_id() | ||
| - | # Apply layers in correct order: L3 first (rewrites words), | |
| - | # then L2 (trailing whitespace), then L1 (zero-width chars). | |
| - | # This prevents L1's invisible chars from fragmenting L3 synonym | |
| - | # words during embedding. | |
| - | text = semantic.apply_semantic(text, mark_id) | |
| + | l3_decision = l3_policy.decide_l3( | |
| + | filename=args.input, | |
| + | content_type=args.content_type, | |
| + | text=text, | |
| + | declared_class=args.document_class, | |
| + | requested_mode=args.l3_mode, | |
| + | ) | |
| + | ||
| + | if l3_decision.enabled: | |
| + | if not args.l3_ack and not _confirm_l3(l3_decision): | |
| + | raise SystemExit( | |
| + | "L3 changes visible text. Re-run with --l3-mode off, " | |
| + | "--l3-mode boilerplate, or --l3-ack to acknowledge." | |
| + | ) | |
| + | text = l3_policy.apply_l3_safe(text, mark_id, mode=l3_decision.mode) | |
| + | watermarks_for_manifest.append(WatermarkRef( | |
| + | layer=f"L3_semantic_{l3_decision.mode}", mark_id=mark_id.hex() | |
| + | )) | |
| + | ||
| text = watermark.embed_ws(text, mark_id) | ||
| text = watermark.embed_zw(text, mark_id) | ||
| plaintext = text.encode("utf-8") | ||
| @@ -109,12 +127,12 @@ def cmd_seal(args): | ||
| watermarks_for_manifest.append(WatermarkRef( | ||
| layer="L2_whitespace", mark_id=mark_id.hex() | ||
| )) | ||
| - | watermarks_for_manifest.append(WatermarkRef( | |
| - | layer="L3_semantic", mark_id=mark_id.hex() | |
| - | )) | |
| print(f"[+] embedded L1 mark {mark_id.hex()}") | ||
| print(f"[+] embedded L2 mark {mark_id.hex()}") | ||
| - | print(f"[+] embedded L3 mark {mark_id.hex()} (semantic + punctuation)") | |
| + | if l3_decision and l3_decision.enabled: | |
| + | print(f"[+] embedded L3 mark {mark_id.hex()} ({l3_decision.mode})") | |
| + | elif l3_decision: | |
| + | print(f"[!] L3 skipped: {l3_decision.reason} ({'; '.join(l3_decision.warnings)})") | |
| # Recipient | ||
| recipient = Recipient( | ||
| @@ -140,6 +158,9 @@ def cmd_seal(args): | ||
| registry_url=args.registry_url, | ||
| content_type=args.content_type, | ||
| ) | ||
| + | manifest.canonical_content_hash = content_hash(canonical_plaintext) | |
| + | if l3_decision: | |
| + | manifest.l3_policy = l3_decision.to_dict() | |
| manifest.watermarks = watermarks_for_manifest | ||
| manifest.beacons = [b.to_dict() for b in beacons] | ||
| @@ -175,6 +196,8 @@ def cmd_seal(args): | ||
| "file_id": manifest.file_id, | ||
| "recipient_id": rec_pub["id"], | ||
| "mark_id": watermarks_for_manifest[0].mark_id if watermarks_for_manifest else None, | ||
| + | "canonical_content_hash": manifest.canonical_content_hash, | |
| + | "l3_policy": manifest.l3_policy, | |
| "fingerprint": fingerprint.to_dict(), | ||
| }, indent=2)) | ||
| print(f"[+] wrote fingerprint to {fp_path}") | ||
| @@ -218,6 +241,16 @@ def cmd_open(args): | ||
| print(f"[+] beacons = {len(manifest.beacons)}") | ||
def _confirm_l3(decision) -> bool:
    """Print the L3 disclosure and ask the operator to acknowledge it.

    Prints the document class, mode, and reason carried by ``decision``, then
    prompts on stdin. Only the exact phrase ``I ACKNOWLEDGE`` (after stripping
    surrounding whitespace) counts as consent.

    Returns:
        True when the operator typed the acknowledgement phrase. False when
        stdin is not a TTY, when the input stream ends before a line is read,
        or when anything else was typed.
    """
    print("[!] L3 semantic watermarking changes visible prose.")
    print(f" document_class={decision.document_class} mode={decision.mode}")
    print(f" reason={decision.reason}")
    if not sys.stdin.isatty():
        return False
    try:
        answer = input(" Type 'I ACKNOWLEDGE' to continue: ").strip()
    except EOFError:
        # End of input (e.g. Ctrl-D) at the prompt is not consent: fail
        # closed instead of crashing with a traceback.
        return False
    return answer == "I ACKNOWLEDGE"
| + | ||
| + | ||
| # ---------------- inspect ---------------- | ||
| def cmd_inspect(args): | ||
| @@ -421,6 +454,15 @@ def main(): | ||
| s.add_argument("--out", required=True) | ||
| s.add_argument("--content-type", default="application/octet-stream") | ||
| s.add_argument("--watermark", action="store_true", help="embed text watermarks") | ||
| + | s.add_argument("--l3-mode", choices=("auto", "off", "full", "boilerplate"), default="auto", | |
| + | help="semantic L3 mode; auto disables L3 for wording-sensitive document classes") | |
| + | s.add_argument("--l3-ack", action="store_true", | |
| + | help="acknowledge that enabled L3 makes recipient text non-identical") | |
| + | s.add_argument("--document-class", | |
| + | choices=("auto", "prose", "legal", "regulatory", "technical_spec", | |
| + | "source_code", "sql", "log", "structured_data"), | |
| + | default="auto", | |
| + | help="declare document class for L3 safety decisions") | |
| s.add_argument("--register", default=None, help="POST manifest to this registry URL") | ||
| o = sub.add_parser("open") |
| @@ -45,17 +45,17 @@ from oversight_core import ( | ||
| open_sealed, | ||
| beacon, | ||
| watermark, | ||
| + | l3_policy, | |
| __version__ as core_version, | ||
| ) | ||
| from oversight_core.container import SealedFile | ||
| - | from oversight_core import semantic | |
| from oversight_core.fingerprint import ContentFingerprint | ||
| # --------------------------------------------------------------------------- | ||
| # Constants | ||
| # --------------------------------------------------------------------------- | ||
| - | CLI_VERSION = "0.4.4" | |
| + | CLI_VERSION = "0.4.5" | |
| CONFIG_FILENAME = "config.json" | ||
| CONFIG_DIR_NAME = ".oversight" | ||
| @@ -511,9 +511,11 @@ def cmd_seal(args): | ||
| do_watermark = args.watermark if args.watermark is not None else cfg.get("default_watermark", True) | ||
| content_type_val = args.content_type or cfg.get("content_type", "application/octet-stream") | ||
| + | canonical_plaintext = plaintext | |
| watermarks_for_manifest: list[WatermarkRef] = [] | ||
| fingerprint = None | ||
| mark_id = None | ||
| + | l3_decision = None | |
| # Run the seal pipeline with progress | ||
| with Progress( | ||
| @@ -537,9 +539,35 @@ def cmd_seal(args): | ||
| if text is not None: | ||
| mark_id = watermark.new_mark_id() | ||
| + | l3_decision = l3_policy.decide_l3( | |
| + | filename=str(input_path), | |
| + | content_type=content_type_val, | |
| + | text=text, | |
| + | declared_class=args.document_class, | |
| + | requested_mode=args.l3_mode, | |
| + | ) | |
| - | progress.update(task, description="Watermarking L3 (semantic)...") | |
| - | text = semantic.apply_semantic(text, mark_id) | |
| + | progress.update(task, description="Evaluating L3 safety policy...") | |
| + | if l3_decision.enabled: | |
| + | progress.stop() | |
| + | if not args.l3_ack: | |
| + | console.print(Panel( | |
| + | "L3 semantic watermarking changes visible prose. " | |
| + | f"Class: [bold]{l3_decision.document_class}[/], " | |
| + | f"mode: [bold]{l3_decision.mode}[/].\n\n" | |
| + | "Enable only when you accept that the recipient copy " | |
| + | "is textually non-identical to the canonical source.", | |
| + | title="[yellow]L3 Disclosure[/]", | |
| + | border_style="yellow", | |
| + | )) | |
| + | if not Confirm.ask("Acknowledge and apply L3?", default=False): | |
| + | error_panel("L3 not acknowledged. Re-run with --l3-mode off or --l3-ack.") | |
| + | sys.exit(1) | |
| + | progress.start() | |
| + | progress.update(task, description=f"Watermarking L3 ({l3_decision.mode})...") | |
| + | text = l3_policy.apply_l3_safe(text, mark_id, mode=l3_decision.mode) | |
| + | else: | |
| + | progress.update(task, description=f"Skipping L3: {l3_decision.document_class}") | |
| progress.advance(task) | ||
| progress.update(task, description="Watermarking L2 (whitespace)...") | ||
| @@ -554,8 +582,11 @@ def cmd_seal(args): | ||
| watermarks_for_manifest = [ | ||
| WatermarkRef(layer="L1_zero_width", mark_id=mark_id.hex()), | ||
| WatermarkRef(layer="L2_whitespace", mark_id=mark_id.hex()), | ||
| - | WatermarkRef(layer="L3_semantic", mark_id=mark_id.hex()), | |
| ] | ||
| + | if l3_decision and l3_decision.enabled: | |
| + | watermarks_for_manifest.append( | |
| + | WatermarkRef(layer=f"L3_semantic_{l3_decision.mode}", mark_id=mark_id.hex()) | |
| + | ) | |
| # Step 4: Build manifest | ||
| progress.update(task, description="Building manifest...") | ||
| @@ -581,6 +612,9 @@ def cmd_seal(args): | ||
| registry_url=registry_url, | ||
| content_type=content_type_val, | ||
| ) | ||
| + | manifest.canonical_content_hash = content_hash(canonical_plaintext) | |
| + | if l3_decision: | |
| + | manifest.l3_policy = l3_decision.to_dict() | |
| manifest.watermarks = watermarks_for_manifest | ||
| manifest.beacons = [b.to_dict() for b in beacons] | ||
| progress.advance(task) | ||
| @@ -614,6 +648,8 @@ def cmd_seal(args): | ||
| "file_id": manifest.file_id, | ||
| "recipient_id": rec_pub["id"], | ||
| "mark_id": mark_id.hex() if mark_id else None, | ||
| + | "canonical_content_hash": manifest.canonical_content_hash, | |
| + | "l3_policy": manifest.l3_policy, | |
| "fingerprint": fingerprint.to_dict(), | ||
| }, indent=2)) | ||
| @@ -629,6 +665,8 @@ def cmd_seal(args): | ||
| table.add_row("Issuer", issuer_id) | ||
| table.add_row("Recipient", rec_pub["id"]) | ||
| table.add_row("Watermarks", str(len(watermarks_for_manifest))) | ||
| + | if l3_decision: | |
| + | table.add_row("L3 policy", f"{l3_decision.mode} ({l3_decision.document_class})") | |
| table.add_row("Beacons", str(len(beacons))) | ||
| table.add_row("Suite", "OSGT-CLASSIC-v1") | ||
| if mark_id: | ||
| @@ -1257,6 +1295,7 @@ def build_parser() -> argparse.ArgumentParser: | ||
| ) | ||
| p.add_argument("--no-banner", action="store_true", help="suppress startup banner") | ||
| sub = p.add_subparsers(dest="cmd") | ||
| + | sub.add_parser("gui", help="launch the graphical desktop app") | |
| # init | ||
| init_p = sub.add_parser("init", help="initialize .oversight/ directory") | ||
| @@ -1292,6 +1331,15 @@ def build_parser() -> argparse.ArgumentParser: | ||
| seal_p.add_argument("--content-type", default=None, help="MIME content type") | ||
| seal_p.add_argument("--watermark", default=None, action="store_true", help="embed watermarks (default from config)") | ||
| seal_p.add_argument("--no-watermark", dest="watermark", action="store_false", help="skip watermarks") | ||
| + | seal_p.add_argument("--l3-mode", choices=("auto", "off", "full", "boilerplate"), default="auto", | |
| + | help="semantic L3 mode; auto disables L3 for wording-sensitive documents") | |
| + | seal_p.add_argument("--l3-ack", action="store_true", | |
| + | help="acknowledge enabled L3 makes recipient text non-identical") | |
| + | seal_p.add_argument("--document-class", | |
| + | choices=("auto", "prose", "legal", "regulatory", "technical_spec", | |
| + | "source_code", "sql", "log", "structured_data"), | |
| + | default="auto", | |
| + | help="declare document class for L3 safety decisions") | |
| seal_p.add_argument("--register", default=None, help="POST manifest to this registry URL") | ||
| # open | ||
| @@ -1343,6 +1391,11 @@ def main(): | ||
| parser.print_help() | ||
| sys.exit(0) | ||
| + | if args.cmd == "gui": | |
| + | from cli.gui import main as gui_main | |
| + | gui_main() | |
| + | return | |
| + | ||
| if show_banner: | ||
| print_banner() | ||
| @@ -1,5 +1,25 @@ | ||
| # Oversight Roadmap | ||
| + | ## April 20, 2026 correction | |
| + | ||
| + | The launch plan is now gated on product usability and threat-model honesty: | |
| + | ||
| + | 1. **L3 safety fixes and collusion docs** - shipped in v0.4.5: L3 defaults off for wording-sensitive document classes, requires explicit disclosure when enabled, records `canonical_content_hash`, and supports a boilerplate-only mode. | |
| + | 2. **Web viewer / drag-drop share UI** - next website/product milestone. Do not launch broadly on HN/Reddit until non-technical recipients can open and inspect Oversight files without the CLI. | |
| + | 3. **Outlook add-in only** for the first ecosystem integration. Defer Drive, Box, SharePoint, and Teams plugins until there is a maintainer or design partner paying for them. | |
| + | 4. **SIEM integration before SOC 2**: prioritize Splunk HEC, Microsoft Sentinel, and Elastic Common Schema exports because they are fast to build and offer high enterprise ROI. | |
| + | 5. **SOC 2 Type 1 scoping** is realistic after a design partner. ISO 27001 comes after SOC 2. **FedRAMP is dropped from near-term planning**; it is a multi-year commercial program requiring sponsor-agency backing. | |
| + | 6. **Registry federation**: publish and harden `docs/spec/registry-v1.md` during the Rust Axum/SQLx registry work so a second operator can run a compatible registry. | |
| + | ||
| + | Correct public-launch sequence: | |
| + | ||
| + | 1. L3 safety + collusion documentation. | |
| + | 2. GUI / web viewer / drag-drop share workflow. | |
| + | 3. Outlook add-in. | |
| + | 4. One regulated-industry design partner deployment. | |
| + | 5. SOC 2 Type 1 scoping in parallel. | |
| + | 6. Public launch after the above, not while CLI-only. | |
| + | ||
| This roadmap tracks work that lives outside a single release cut: external | ||
| integrations, spec publication, third-party review, and community milestones. | ||
| Every item references real upstream projects with current links so the plan |
| @@ -131,6 +131,8 @@ The manifest is canonical JSON (sorted keys, no whitespace, UTF-8). Required fie | ||
| - `version` (`"OVERSIGHT-v1"`) | ||
| - `suite` (suite identifier string) | ||
| - `content_hash` (hex SHA-256 of plaintext) | ||
| + | - `canonical_content_hash` (hex SHA-256 of the source bytes before | |
| + | L3/L2/L1 watermarking; used to resolve wording disputes) | |
| - `size_bytes` (plaintext length) | ||
| - `issuer_id` (string) | ||
| - `issuer_ed25519_pub` (hex) | ||
| @@ -143,6 +145,8 @@ Optional fields: | ||
| - `watermarks` (array of `{layer, mark_id}`) | ||
| - `beacons` (array of beacon descriptors) | ||
| - `policy` (`not_after`, `max_opens`, `jurisdiction`, `registry_url`, `require_attestation`) | ||
| + | - `l3_policy` (object describing L3 mode, document class, disclosure state, | |
| + | and safety rationale) | |
| - `signature_ml_dsa` (hex, for HYBRID suites) | ||
| ### 5.3 DEK wrapping | ||
| @@ -165,13 +169,20 @@ After decryption, the implementation MUST verify that `SHA-256(plaintext) == man | ||
| ## 6. Watermarking | ||
| - | Watermarking is optional but RECOMMENDED. Each applied layer registers a `mark_id` in the manifest. | |
| + | Watermarking is optional but RECOMMENDED. Each applied layer registers a | |
| + | `mark_id` in the manifest. L3 semantic watermarking changes visible prose and | |
| + | is therefore opt-in for wording-sensitive classes. Implementations MUST | |
| + | default L3 off for legal documents, regulatory filings, technical | |
| + | specifications, source code, SQL, logs, and structured data, and MUST NOT | |
| + | apply L3 to those classes unless the user explicitly enables it and | |
| + | acknowledges the textual change. | |
| ### 6.1 Layer identifiers | ||
| - `L1_zero_width` - zero-width unicode characters scattered through text payloads | ||
| - `L2_whitespace` - trailing space vs tab at line endings | ||
| - | - `L3_synonyms` - synonym-class rotation (reserved; MVP stub) | |
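| + | - `L3_synonyms` - legacy synonym-class rotation identifier | |
| + | - `L3_semantic` - legacy identifier written by CLI releases before v0.4.5; readers SHOULD still accept it | |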
| + | - `L3_semantic_full` - guarded semantic marks over eligible prose regions | |
| + | - `L3_semantic_boilerplate` - guarded semantic marks limited to header/footer/cover-page regions | |
| - `L4_dct_visual` - reserved; for image payloads | ||
| - `L5_layout` - reserved; for PDF/document layout perturbation | ||
| @@ -0,0 +1,73 @@ | ||
| + | # Oversight Security Notes | |
| + | ||
| + | This document is the honest threat-model companion to the protocol spec. It | |
| + | uses RFC 2119 / BCP 14 language for requirements; those terms are interpreted | |
| + | only when written in all capitals. | |
| + | ||
| + | ## Watermark Layer Limits | |
| + | ||
| + | | Layer | Screenshot | Reformat | Manual retype | Motivated adversary with vocab | | |
| + | |-------|------------|----------|---------------|--------------------------------| | |
| + | | L1 zero-width | No | Often no | No | No | | |
| + | | L2 whitespace | No | No | No | No | | |
| + | | L3 semantic | Yes | Yes | Often yes | No; canonicalization can defeat it | | |
| + | ||
| + | Each cell states whether the layer's mark survives the listed transformation | |
| + | or adversary. L1 and L2 are steganographic convenience layers. They are useful | |
| + | forensic signals but fragile against normalization. L3 is stronger because it | |
| + | encodes choices in visible prose, but that means it changes the recipient copy. | |
| + | ||
| + | ## L3 Semantic Watermark Safety | |
| + | ||
| + | L3 is opt-in for wording-sensitive documents. The seal path defaults L3 off | |
| + | for legal documents, regulatory filings, technical specifications, source | |
| + | code, SQL, logs, and structured data. When L3 is enabled, users must | |
| + | acknowledge that the recipient copy is textually non-identical to the | |
| + | canonical source. The manifest records `canonical_content_hash` so a dispute | |
| + | can compare the recipient copy against the original source bytes. | |
| + | ||
| + | Safe L3 application skips a conservative set of protected regions: | |
| + | ||
| + | - RFC 2119 / BCP 14 requirement keywords such as `MUST`, `SHOULD`, and `MAY` | |
| + | - numerical values with units or percentages | |
| + | - quoted text, inline code, code blocks, and indented code | |
| + | - ALL-CAPS defined terms | |
| + | - likely source-code, SQL, log, and structured-data inputs | |
| + | ||
| + | `boilerplate` L3 mode marks only header/footer/cover-page style regions and is | |
| + | the preferred mode when a user wants a semantic signal for contracts or | |
| + | filings without changing the body text. | |
| + | ||
| + | ## Collusion Threat Model | |
| + | ||
| + | L3 synonym choices are deterministic per mark ID. If multiple recipients | |
| + | collude and compare their copies, they can identify controlled vocabulary | |
| + | positions and may canonicalize those positions before leaking. That can defeat | |
| + | L3 attribution silently. Mitigations under evaluation: | |
| + | ||
| + | - per-recipient vocabulary randomization | |
| + | - stronger candidate scoring that models collusion edits | |
| + | - warnings or thresholds for large recipient sets before L3 is enabled | |
| + | ||
| + | Until those mitigations land, issuers should treat L3 as attribution evidence | |
| + | against ordinary leaks and low-to-medium effort stripping, not as a perfect | |
| + | collusion-resistant watermark. | |
| + | ||
| + | ## Passive Beacons | |
| + | ||
| + | Passive beacons are forensic telemetry, not a detection guarantee. Absence of | |
| + | a beacon does not prove absence of a leak. Corporate egress filtering, | |
| + | air-gapped readers, privacy tools, sandboxed previews, and offline workflows | |
| + | can suppress callbacks. | |
| + | ||
| + | ## Jurisdiction Policy | |
| + | ||
| + | Jurisdiction-by-IP is a soft policy control. It is useful for honest clients, | |
| + | audit trails, and routing decisions, but it is not a cryptographic security | |
| + | boundary. VPNs, proxies, and corporate NATs can defeat or blur IP geolocation. | |
| + | ||
| + | ## RFC 3161 Timestamps | |
| + | ||
| + | RFC 3161 timestamps prove a datum existed at or before the TSA signing time. | |
| + | They do not prove authorship. The TSA remains a trust anchor. Rekor / DSSE | |
| + | transparency reduces reliance on a single private timestamping service, but it | |
| + | does not eliminate timestamp trust entirely. |
| @@ -0,0 +1,117 @@ | ||
| + | # Oversight Registry v1 Interop Draft | |
| + | ||
| + | Status: draft; wire format is not stable until v1.0. | |
| + | ||
| + | This document defines the minimum interoperable registry surface for an | |
| + | independent Oversight registry operator. It follows OpenAPI 3.1 conventions for | |
| + | schema shape and keeps Oversight-specific policy out of the transport where | |
| + | possible. | |
| + | ||
| + | ## Goals | |
| + | ||
| + | - Let more than one operator run a compatible attribution registry. | |
| + | - Preserve issuer-signed manifest authority: request sidecars MUST match the | |
| + | manifest's signed `beacons` and `watermarks` arrays. | |
| + | - Keep beacon callbacks passive, and authenticate them between DNS/web | |
| + | beacon collectors and the registry. | |
| + | - Preserve local or public transparency-log evidence for every registration | |
| + | and event. | |
| + | ||
| + | ## Common Requirements | |
| + | ||
| + | - All JSON request bodies SHOULD be UTF-8 encoded. | |
| + | - Registries MUST reject unknown oversized identifiers. The reference limit is | |
| + | 256 bytes for `file_id`, `mark_id`, `token_id`, `recipient_id`, and | |
| + | `issuer_id`. | |
| + | - Registries MUST verify the Ed25519 signature on the manifest before writing | |
| + | beacons, watermarks, corpus hashes, Rekor entries, or tlog events. | |
| + | - Registries MUST NOT accept beacon or watermark sidecars that differ from the | |
| + | issuer-signed manifest copies. | |
| + | - DNS event callbacks from non-loopback clients MUST authenticate with | |
| + | `X-Oversight-DNS-Secret` or an equivalent deployment-specific channel. | |
| + | ||
| + | ## Endpoints | |
| + | ||
| + | | Method | Path | Purpose | | |
| + | |--------|------|---------| | |
| + | | `GET` | `/health` | Service health and tlog size | | |
| + | | `POST` | `/register` | Register signed manifest, beacons, watermarks, optional corpus hashes | | |
| + | | `POST` | `/attribute` | Look up attribution by `token_id`, `mark_id`, or perceptual/content hash | | |
| + | | `GET` | `/query/{file_id}` | Return manifest ownership plus registered beacons/watermarks | | |
| + | | `POST` | `/dns_event` | Authenticated DNS beacon callback | | |
| + | | `GET` | `/evidence/{file_id}` | Evidence bundle with manifest, events, tlog proofs, and signed tree head | | |
| + | ||
| + | ## `/register` | |
| + | ||
| + | Request: | |
| + | ||
| + | ```json | |
| + | { | |
| + | "manifest": {}, | |
| + | "beacons": [], | |
| + | "watermarks": [], | |
| + | "corpus": { | |
| + | "winnowing": "optional-hash", | |
| + | "sentence": "optional-hash" | |
| + | } | |
| + | } | |
| + | ``` | |
| + | ||
| + | Validation: | |
| + | ||
| + | 1. Canonicalize and verify `manifest.signature_ed25519`. | |
| + | 2. Compare `beacons` and `watermarks` against signed manifest arrays. | |
| + | 3. Reject malformed signed artifacts rather than silently dropping rows. | |
| + | 4. Append a registry transparency-log event. | |
| + | 5. If Rekor is enabled and a watermark mark ID exists, attest using | |
| + | `subject.name = "mark:<mark_id>"` and | |
| + | `subject.digest.sha256 = manifest.content_hash`. | |
| + | ||
| + | Response: | |
| + | ||
| + | ```json | |
| + | { | |
| + | "ok": true, | |
| + | "file_id": "uuid", | |
| + | "registered_beacons": 1, | |
| + | "tlog_index": 42, | |
| + | "rekor": {} | |
| + | } | |
| + | ``` | |
| + | ||
| + | ## `/dns_event` | |
| + | ||
| + | Request: | |
| + | ||
| + | ```json | |
| + | { | |
| + | "token_id": "hex-or-url-safe-token", | |
| + | "client_ip": "collector-observed-ip", | |
| + | "qtype": "A", | |
| + | "qname": "token.beacon.example" | |
| + | } | |
| + | ``` | |
| + | ||
| + | Security: | |
| + | ||
| + | - Public/non-loopback callbacks MUST include `X-Oversight-DNS-Secret`. | |
| + | - Registries SHOULD prefer collector-observed source metadata over | |
| + | user-controlled body fields when available. | |
| + | - Events SHOULD be appended to the local transparency log and included in | |
| + | evidence bundles. | |
| + | ||
| + | ## Evidence Bundle | |
| + | ||
| + | Evidence bundles SHOULD contain: | |
| + | ||
| + | - manifest JSON and signature | |
| + | - registry event rows | |
| + | - local tlog signed tree head | |
| + | - inclusion proof for every bundled tlog event | |
| + | - Rekor DSSE bundle, if public transparency was requested | |
| + | ||
| + | ## Federation Notes | |
| + | ||
| + | The wire format MUST NOT require the official `oversightprotocol.dev` domain. | |
| + | Operators may run their own registry and beacon domains as long as manifests | |
| + | declare the registry URL and beacon descriptors unambiguously. |
| @@ -14,7 +14,7 @@ Core: | ||
| from .container import seal, open_sealed, SealedFile | ||
| from .manifest import Manifest, Recipient, WatermarkRef | ||
| from .crypto import ClassicIdentity, random_dek, content_hash | ||
| - | from . import watermark, beacon | |
| + | from . import watermark, beacon, l3_policy | |
| __all__ = [ | ||
| "seal", | ||
| @@ -28,6 +28,7 @@ __all__ = [ | ||
| "content_hash", | ||
| "watermark", | ||
| "beacon", | ||
| + | "l3_policy", | |
| ] | ||
| - | __version__ = "0.4.4" | |
| + | __version__ = "0.4.5" |
| @@ -11,10 +11,10 @@ into a single apply/recover API. | ||
| from __future__ import annotations | ||
| - | from .. import watermark, semantic | |
| + | from .. import watermark, l3_policy, semantic | |
| - | def apply(text: str, mark_id: bytes, layers: tuple[str, ...] = ("L1", "L2", "L3")) -> str: | |
| + | def apply(text: str, mark_id: bytes, layers: tuple[str, ...] = ("L1", "L2")) -> str: | |
| """Apply all requested watermark layers to UTF-8 text. | ||
| Layer order matters: L3 rewrites visible words, so it must run before the | ||
| @@ -22,7 +22,7 @@ def apply(text: str, mark_id: bytes, layers: tuple[str, ...] = ("L1", "L2", "L3" | ||
| """ | ||
| t = text | ||
| if "L3" in layers: | ||
| - | t = semantic.apply_semantic(t, mark_id) | |
| + | t = l3_policy.apply_l3_safe(t, mark_id, mode="full") | |
| if "L2" in layers: | ||
| t = watermark.embed_ws(t, mark_id) | ||
| if "L1" in layers: |
| @@ -0,0 +1,214 @@ | ||
| + | """ | |
| + | L3 semantic-watermark safety policy. | |
| + | ||
| + | L3 is powerful because it changes visible prose. That also makes it unsafe for | |
| + | classes where exact wording is the evidence: contracts, filings, code, logs, | |
| + | structured data, and technical specifications. This module decides when L3 is | |
| + | allowed and applies it only to conservative prose regions. | |
| + | """ | |
| + | ||
| + | from __future__ import annotations | |
| + | ||
| + | from dataclasses import dataclass, asdict | |
| + | from pathlib import Path | |
| + | import re | |
| + | from typing import Optional | |
| + | ||
| + | from . import semantic | |
| + | ||
| + | ||
# Filename suffixes that classify_document treats as wording-sensitive by
# extension alone. Grouped by the document class each group maps to.
RISKY_EXTENSIONS = (
    # source code and scripts (".sql" is reported as its own "sql" class)
    {
        ".c", ".cc", ".cpp", ".cs", ".css", ".go", ".h", ".hpp", ".java",
        ".js", ".jsx", ".kt", ".lua", ".php", ".py", ".rb", ".rs", ".sh",
        ".sql", ".swift", ".ts", ".tsx",
    }
    # structured data and configuration
    | {
        ".json", ".jsonl", ".yaml", ".yml", ".toml", ".xml", ".csv", ".tsv",
        ".ini", ".conf", ".cfg", ".lock", ".env",
    }
    # logs
    | {".log"}
)

# Suffixes that mark a file as a legal document.
LEGAL_EXTENSIONS = {".contract", ".filing", ".nda", ".msa", ".sow"}

# MIME types are matched with str.startswith, so parameters such as
# "; charset=utf-8" after the type do not defeat the match.
STRUCTURED_MIME_PREFIXES = (
    "application/json",
    "application/xml",
    "application/x-yaml",
    "text/csv",
    "text/tab-separated-values",
)

# Substrings looked for anywhere inside a lower-cased MIME type.
SOURCE_MIME_HINTS = ("source", "script", "sql", "json", "yaml", "xml")

# RFC 2119 / BCP 14 requirement keywords, counted to detect specifications.
RFC2119 = {
    "MUST",
    "MUST NOT",
    "REQUIRED",
    "SHALL",
    "SHALL NOT",
    "SHOULD",
    "SHOULD NOT",
    "RECOMMENDED",
    "NOT RECOMMENDED",
    "MAY",
    "OPTIONAL",
}
| + | ||
| + | ||
@dataclass
class L3Decision:
    """Result of an L3 policy evaluation, as built by ``decide_l3``."""

    # Whether L3 should be applied.
    enabled: bool
    # Mode to apply: "off", "boilerplate", or "full".
    mode: str
    # Document class the decision was based on (e.g. "prose", "legal").
    document_class: str
    # Whether the user has to acknowledge the textual change before sealing.
    requires_ack: bool
    # Short human-readable explanation of the outcome.
    reason: str
    # Classification reasons, plus any warnings raised while deciding.
    warnings: list[str]

    def to_dict(self) -> dict:
        """Return the decision as a plain dict via ``dataclasses.asdict``."""
        as_mapping = asdict(self)
        return as_mapping
| + | ||
| + | ||
def classify_document(
    *,
    filename: str = "",
    content_type: str = "",
    text: str = "",
    declared_class: str = "auto",
) -> tuple[str, list[str]]:
    """Classify a document for L3 safety decisions.

    Signals are evaluated in priority order and the first hit wins: an
    explicit ``declared_class``, the filename extension, the MIME type, and
    finally keyword heuristics over the first 8192 characters of ``text``.

    Args:
        filename: Original filename; only its suffix is inspected.
        content_type: MIME type of the payload, compared case-insensitively.
        text: Document text used for the keyword heuristics.
        declared_class: Caller-supplied class. Any value other than ``"auto"``
            (or empty) is returned as-is without further inspection.

    Returns:
        ``(document_class, reasons)``. Without a declared class,
        ``document_class`` is one of ``"legal"``, ``"sql"``, ``"log"``,
        ``"structured_data"``, ``"source_code"``, ``"technical_spec"``,
        ``"regulatory"``, or ``"prose"``; ``reasons`` explains the choice.
    """
    if declared_class and declared_class != "auto":
        return declared_class, [f"declared document class: {declared_class}"]

    # Tolerate None for every optional input, not just content_type.
    suffix = Path(filename or "").suffix.lower()
    ctype = (content_type or "").lower()
    sample = (text or "")[:8192]

    if suffix in LEGAL_EXTENSIONS:
        return "legal", [f"legal-sensitive extension {suffix}"]
    if suffix in RISKY_EXTENSIONS:
        if suffix == ".sql":
            return "sql", [f"SQL extension {suffix}"]
        if suffix == ".log":
            return "log", [f"log extension {suffix}"]
        if suffix in {".json", ".jsonl", ".yaml", ".yml", ".toml", ".xml", ".csv", ".tsv", ".ini", ".conf", ".cfg", ".lock", ".env"}:
            return "structured_data", [f"structured-data extension {suffix}"]
        return "source_code", [f"source-code extension {suffix}"]

    if any(ctype.startswith(p) for p in STRUCTURED_MIME_PREFIXES):
        return "structured_data", [f"structured MIME type {content_type}"]
    # NOTE(review): this is a substring test, so any MIME type containing
    # e.g. "xml" is treated as code-like, including OOXML office types such as
    # "...openxmlformats-officedocument.wordprocessingml.document". Confirm
    # that is intended.
    if any(h in ctype for h in SOURCE_MIME_HINTS):
        return "source_code", [f"code-like MIME type {content_type}"]

    # Case-sensitive on purpose: only upper-case keywords count as RFC 2119.
    upper_hits = sum(1 for kw in RFC2119 if re.search(rf"\b{re.escape(kw)}\b", sample))
    if upper_hits >= 3:
        return "technical_spec", ["multiple RFC 2119 requirement keywords"]
    if re.search(r"\b(SEC|FDA|FINRA|10-K|10-Q|8-K|S-1|regulation|compliance filing)\b", sample, re.I):
        return "regulatory", ["regulatory/filing language detected"]
    # "indemnif" is a stem, so it needs \w* before the closing \b; a bare
    # stem followed by \b never matches "indemnify" or "indemnification".
    if re.search(r"\b(agreement|whereas|hereby|indemnif\w*|governing law|jurisdiction|party|parties)\b", sample, re.I):
        return "legal", ["contract/legal language detected"]
    if re.search(r"```|^\s{4,}\S|SELECT\s+.+\s+FROM|CREATE\s+TABLE", sample, re.I | re.M):
        return "technical_spec", ["code block or specification-like syntax detected"]

    return "prose", ["no high-risk L3 signals detected"]
| + | ||
| + | ||
def decide_l3(
    *,
    filename: str = "",
    content_type: str = "",
    text: str = "",
    declared_class: str = "auto",
    requested_mode: str = "auto",
) -> L3Decision:
    """Decide whether L3 should run for a document, and in which mode.

    The document is classified first. An explicit ``requested_mode`` of
    ``"off"``, ``"boilerplate"``, or ``"full"`` is then honoured regardless of
    class; any other value is handled as automatic selection, which turns L3
    off for wording-sensitive classes and on (``"full"``) otherwise.

    Returns:
        An ``L3Decision``. Every enabled decision has ``requires_ack=True``.
    """
    doc_class, reasons = classify_document(
        filename=filename,
        content_type=content_type,
        text=text,
        declared_class=declared_class,
    )
    wording_sensitive = doc_class in {
        "legal", "regulatory", "technical_spec", "source_code", "sql",
        "log", "structured_data",
    }

    # Explicit user choices win over the classification.
    if requested_mode == "off":
        return L3Decision(
            enabled=False,
            mode="off",
            document_class=doc_class,
            requires_ack=False,
            reason="L3 disabled by user",
            warnings=reasons,
        )
    if requested_mode == "boilerplate":
        return L3Decision(
            enabled=True,
            mode="boilerplate",
            document_class=doc_class,
            requires_ack=True,
            reason="boilerplate-only L3 requested",
            warnings=reasons,
        )
    if requested_mode == "full":
        notes = list(reasons)
        if wording_sensitive:
            notes.append(
                "L3 full mode was explicitly requested for a wording-sensitive document class."
            )
        return L3Decision(
            enabled=True,
            mode="full",
            document_class=doc_class,
            requires_ack=True,
            reason="full L3 explicitly requested",
            warnings=notes,
        )

    # Automatic selection.
    if wording_sensitive:
        return L3Decision(
            enabled=False,
            mode="off",
            document_class=doc_class,
            requires_ack=False,
            reason="L3 defaults off for wording-sensitive document classes",
            warnings=reasons,
        )
    return L3Decision(
        enabled=True,
        mode="full",
        document_class=doc_class,
        requires_ack=True,
        reason="L3 auto-enabled for prose",
        warnings=reasons,
    )
| + | ||
| + | ||
def apply_l3_safe(text: str, mark_id: bytes, mode: str = "full") -> str:
    """Apply L3 marks line by line, leaving protected lines untouched.

    Lines inside (and delimiting) triple-backtick fences, lines flagged by
    ``_line_is_protected``, and, in ``"boilerplate"`` mode, lines that are not
    boilerplate are copied through verbatim. ``mode="off"`` returns ``text``
    unchanged.
    """
    if mode == "off":
        return text

    source_lines = text.splitlines(keepends=True)
    line_count = len(source_lines)
    boilerplate_only = mode == "boilerplate"
    inside_fence = False
    marked: list[str] = []

    for position, raw in enumerate(source_lines):
        if raw.strip().startswith("```"):
            # The fence marker itself is never rewritten.
            inside_fence = not inside_fence
            marked.append(raw)
        elif inside_fence or _line_is_protected(raw):
            marked.append(raw)
        elif boilerplate_only and not _is_boilerplate_line(raw, position, line_count):
            marked.append(raw)
        else:
            marked.append(_apply_l3_to_unquoted_segments(raw, mark_id))
    return "".join(marked)
| + | ||
| + | ||
# Patterns that mark a whole line as off-limits for L3. Compiled once because
# the check runs for every line of every document.
_PROTECTED_LINE_PATTERNS = (
    # line starts with a SQL statement keyword
    re.compile(r"^\s*(SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|DROP)\b", re.I),
    # inline code span
    re.compile(r"`[^`]+`"),
    # upper-case RFC 2119 / BCP 14 requirement keywords
    re.compile(r"\b(?:MUST|SHOULD|MAY|SHALL|REQUIRED|OPTIONAL)(?:\s+NOT)?\b"),
    # number followed by a unit or percentage. The trailing \b applies only to
    # the alphabetic units: "%" is not a word character, so "\b" after it
    # fails before a space or punctuation and "5% of" would go unprotected.
    re.compile(
        r"\b\d+(?:\.\d+)?\s*"
        r"(?:%|(?:percent|kg|g|mg|lb|oz|m|cm|mm|km|ft|in|ms|s|sec|min|h|hr|USD|EUR|GBP|MB|GB|TB)\b)",
        re.I,
    ),
    # ALL-CAPS defined term of three or more characters
    re.compile(r"\b[A-Z][A-Z0-9_-]{2,}\b"),
)


def _line_is_protected(line: str) -> bool:
    """Return True when ``line`` must be passed through without L3 changes.

    Blank lines are not protected (there is nothing to rewrite). A line is
    protected when it looks like indented code or an interpreter transcript,
    or when it matches any of ``_PROTECTED_LINE_PATTERNS``.
    """
    if not line.strip():
        return False
    # Four-space or tab indent (indented code block) and doctest-style prompts.
    if line.startswith(("    ", "\t", ">>> ", "... ")):
        return True
    return any(pattern.search(line) for pattern in _PROTECTED_LINE_PATTERNS)
| + | ||
| + | ||
def _is_boilerplate_line(line: str, idx: int, total: int) -> bool:
    """Return True for header/footer/cover-page style lines.

    A line counts as boilerplate when it is among the first six or last six
    lines of the document, or when it contains a boilerplate keyword.
    """
    near_top = idx < 6
    near_bottom = idx >= max(0, total - 6)
    if near_top or near_bottom:
        return True
    keyword_hit = re.search(
        r"\b(confidential|proprietary|notice|copyright|footer|header|cover page)\b",
        line,
        re.I,
    )
    return keyword_hit is not None
| + | ||
| + | ||
# Straight double quotes, straight single quotes, and curly double quotes.
# The curly quotes are written as \u escapes (U+201C / U+201D) so that the
# pattern cannot be corrupted by an encoding round-trip of this file.
_QUOTED_SPAN = re.compile(
    r"((?:\"[^\"]*\")|(?:'[^']*')|(?:\u201c[^\u201d]*\u201d))"
)


def _apply_l3_to_unquoted_segments(line: str, mark_id: bytes) -> str:
    """Apply L3 marks to the parts of ``line`` that lie outside quotes.

    The line is split on quoted spans; because the pattern has one capturing
    group, even indices of the split hold unquoted text and odd indices hold
    the quoted spans, which are copied through unchanged.
    """
    parts = _QUOTED_SPAN.split(line)
    for i in range(0, len(parts), 2):
        segment = parts[i]
        if not segment.strip():
            continue
        # Safe L3 avoids number-format marks entirely and only transforms prose
        # segments that passed the line-level guards.
        if semantic.SYNONYMS_V2_AVAILABLE:
            segment = semantic.embed_synonyms_v2(segment, mark_id, min_instances=1)
        else:
            segment = semantic.embed_synonyms(segment, mark_id, min_instances=1)
        segment = semantic.embed_spelling(segment, mark_id)
        segment = semantic.embed_contractions(segment, mark_id)
        parts[i] = segment
    return "".join(parts)
| @@ -45,6 +45,7 @@ class Manifest: | ||
| # file properties | ||
| original_filename: str = "" | ||
| content_hash: str = "" # sha256 of plaintext | ||
| + | canonical_content_hash: str = "" # sha256 of source before semantic/L1/L2 marks | |
| content_type: str = "application/octet-stream" | ||
| size_bytes: int = 0 | ||
| @@ -61,6 +62,7 @@ class Manifest: | ||
| # policy | ||
| policy: dict = field(default_factory=dict) | ||
| + | l3_policy: dict = field(default_factory=dict) | |
| # policy fields (opt): | ||
| # not_after: int (unix) | ||
| # max_opens: int | ||
| @@ -103,6 +105,7 @@ class Manifest: | ||
| issued_at=int(time.time()), | ||
| original_filename=original_filename, | ||
| content_hash=content_hash, | ||
| + | canonical_content_hash=content_hash, | |
| content_type=content_type, | ||
| size_bytes=size_bytes, | ||
| issuer_id=issuer_id, |
| @@ -256,6 +256,11 @@ def iter_matchable_words(text: str) -> Iterator[tuple[int, int, str, tuple[int, | ||
| # Skip if any part of the word is inside a skip region | ||
| if any(skip_mask[i] for i in range(m.start(), m.end())): | ||
| continue | ||
| + | # Conservative L3 safety: do not alter ALL-CAPS defined terms or | |
| + | # capitalized words that may be proper nouns. | |
| + | word = m.group(1) | |
| + | if word.isupper() or (word[:1].isupper() and m.start() != 0): | |
| + | continue | |
| key = m.group(1).lower() | ||
| if key in _LOOKUP: | ||
| - | yield m.start(), m.end(), m.group(1), _LOOKUP[key] | |
| + | yield m.start(), m.end(), word, _LOOKUP[key] |
| @@ -211,7 +211,13 @@ def extract_ws_partial( | ||
| # ---------------- high-level apply/recover ---------------- | ||
| - | def apply_all(text: str, mark_id: bytes) -> str: | |
| + | def apply_all( | |
| + | text: str, | |
| + | mark_id: bytes, | |
| + | *, | |
| + | include_l3: bool = False, | |
| + | l3_mode: str = "full", | |
| + | ) -> str: | |
| """ | ||
| Apply all available watermark layers to text. | ||
| @@ -220,8 +226,9 @@ def apply_all(text: str, mark_id: bytes) -> str: | ||
| last because it inserts invisible characters that could fragment synonym | ||
| words if applied earlier. | ||
| """ | ||
| - | if _L3_AVAILABLE: | |
| - | t = _semantic.apply_semantic(text, mark_id) | |
| + | if include_l3 and _L3_AVAILABLE: | |
| + | from . import l3_policy | |
| + | t = l3_policy.apply_l3_safe(text, mark_id, mode=l3_mode) | |
| else: | ||
| t = text | ||
| t = embed_ws(t, mark_id) |
| @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" | ||
| [project] | ||
| name = "oversight-protocol" | ||
| - | version = "0.4.4" | |
| + | version = "0.4.5" | |
| description = "Open protocol for cryptographic data provenance, recipient attribution, and leak detection." | ||
| readme = "README.md" | ||
| license = {text = "Apache-2.0"} | ||
| @@ -53,6 +53,7 @@ all = ["oversight-protocol[registry,formats]"] | ||
| [project.scripts] | ||
| oversight = "cli.oversight_rich:main" | ||
| + | oversight-gui = "cli.gui:main" | |
| [project.urls] | ||
| Homepage = "https://oversight-protocol.github.io/oversight/" |
| @@ -0,0 +1,65 @@ | ||
| + | #!/usr/bin/env python3 | |
| + | """Focused tests for L3 safety policy.""" | |
| + | ||
| + | import os | |
| + | import sys | |
| + | ||
| + | ROOT = os.path.join(os.path.dirname(__file__), "..") | |
| + | sys.path.insert(0, ROOT) | |
| + | ||
| + | from oversight_core import l3_policy, watermark | |
| + | ||
| + | ||
def ok(msg: str) -> None:
    """Print a PASS line for a finished check."""
    print(" [PASS] {}".format(msg))
| + | ||
| + | ||
def t1_risky_documents_default_l3_off():
    """Automatic mode leaves L3 disabled for specification-like content."""
    request = {
        "filename": "api-spec.md",
        "content_type": "text/markdown",
        "text": "The system MUST verify every request. SELECT * FROM users;",
        "requested_mode": "auto",
    }
    decision = l3_policy.decide_l3(**request)
    assert not decision.enabled
    assert decision.document_class == "technical_spec"
    ok("technical/spec content disables L3 by default")
| + | ||
| + | ||
def t2_full_l3_requires_ack_metadata():
    """Explicit ``full`` mode enables L3 and flags it for acknowledgement."""
    request = {
        "filename": "brief.txt",
        "content_type": "text/plain",
        "text": "This report will begin with a large review and explain the issue.",
        "requested_mode": "full",
    }
    decision = l3_policy.decide_l3(**request)
    assert decision.enabled
    assert decision.requires_ack
    assert decision.mode == "full"
    ok("explicit full L3 returns acknowledgement-required decision")
| + | ||
| + | ||
def t3_safe_l3_preserves_protected_lines():
    """Protected lines survive ``apply_l3_safe`` verbatim; prose gets marked."""
    mark_id = watermark.new_mark_id()
    requirement_line = "The Vendor MUST provide 5 kg by Friday.\n"
    prose_line = "This report will begin with a large review and explain the issue for Alice.\n"
    code_line = " SELECT * FROM users;\n"
    original = requirement_line + prose_line + code_line

    marked = l3_policy.apply_l3_safe(original, mark_id, mode="full")

    assert "The Vendor MUST provide 5 kg by Friday." in marked
    assert "Alice" in marked
    assert " SELECT * FROM users;" in marked
    # NOTE(review): this relies on the random mark ID changing at least one
    # word of the prose line - confirm the semantic layer guarantees that.
    assert marked != original
    ok("safe L3 preserves RFC2119/numeric/code lines while marking prose")
| + | ||
| + | ||
if __name__ == "__main__":
    # Script-style runner: execute every check in order and report a summary.
    banner = "=" * 60
    print(banner)
    print("oversight_core.l3_policy - focused unit tests")
    print(banner)
    for check in (
        t1_risky_documents_default_l3_off,
        t2_full_l3_requires_ack_metadata,
        t3_safe_l3_preserves_protected_lines,
    ):
        check()
    print("\n ALL TESTS PASSED - 3/3")
| @@ -22,10 +22,10 @@ def t1_text_adapter_matches_core_order(): | ||
| "A second paragraph helps the semantic watermark choose visible variants." | ||
| ) | ||
| mark_id = watermark.new_mark_id() | ||
| - | via_adapter = text_format.apply(original, mark_id) | |
| - | via_core = watermark.apply_all(original, mark_id) | |
| + | via_adapter = text_format.apply(original, mark_id, layers=("L1", "L2", "L3")) | |
| + | via_core = watermark.apply_all(original, mark_id, include_l3=True) | |
| assert via_adapter == via_core, "text adapter diverged from core watermark order" | ||
| - | print(" [PASS] text adapter applies L3/L2/L1 in the same order as the core pipeline") | |
| + | print(" [PASS] text adapter applies explicit L3/L2/L1 in the same order as the core pipeline") | |
| def main(): |