| 1 | """ |
| 2 | oversight_core.formats.text - text format adapter. |
| 3 | |
| 4 | Wraps the three watermark layers: |
| 5 | L1 zero-width unicode (watermark.py) |
| 6 | L2 trailing whitespace (watermark.py) |
| 7 | L3 semantic (semantic.py) |
| 8 | |
| 9 | into a single apply/recover API. |
| 10 | """ |
| 11 | |
| 12 | from __future__ import annotations |
| 13 | |
| 14 | from .. import watermark, l3_policy, semantic |
| 15 | |
| 16 | |
def apply(text: str, mark_id: bytes, layers: tuple[str, ...] = ("L1", "L2")) -> str:
    """Embed ``mark_id`` into ``text`` using each layer named in ``layers``.

    Recognised layer names are "L3", "L2" and "L1"; any other entry in
    ``layers`` is ignored. Layers are always applied in the fixed order
    L3 -> L2 -> L1, regardless of their order in ``layers``: L3 rewrites
    visible words, so it has to run before the L2/L1 steganographic layers
    that append whitespace and zero-width characters.

    Returns the text after every requested layer has been applied; when no
    recognised layer is requested the input is returned unchanged.
    """
    # Ordered (layer name, embedder) pairs. The lambdas defer every call into
    # the watermark modules until the layer is actually requested.
    pipeline = (
        ("L3", lambda s: l3_policy.apply_l3_safe(s, mark_id, mode="full")),
        ("L2", lambda s: watermark.embed_ws(s, mark_id)),
        ("L1", lambda s: watermark.embed_zw(s, mark_id)),
    )

    marked = text
    for layer_name, embed in pipeline:
        if layer_name in layers:
            marked = embed(marked)
    return marked
| 31 | |
| 32 | |
def recover(text: str, candidate_mark_ids: list[bytes] | None = None) -> dict:
    """Recover attribution evidence from ``text``.

    Args:
        text: The text to inspect.
        candidate_mark_ids: Mark ids to test against the L3 (semantic) layer,
            usually taken from the registry. When ``None`` or empty, no L3
            verification is attempted.

    Returns:
        A dict with three keys::

            {
                "L1_hits": [mark_id_hex, ...],
                "L2_hits": [mark_id_hex],   # empty when nothing was found
                "L3_matches": [
                    {"mark_id": mark_id_hex,
                     "syn_score": ...,
                     "punct_score": ...},
                    ...
                ],
            }

        "L1_hits" holds the hex form of every value yielded by
        ``watermark.extract_zw``. "L2_hits" holds at most one entry: the hex
        form of the value returned by ``watermark.extract_ws``, if that value
        is truthy. "L3_matches" contains one entry per candidate whose
        ``semantic.verify_semantic`` result reports a truthy
        ``"overall_match"``; candidates that do not match are omitted.

    L1 and L2 recover the mark id directly from invisible content, whereas L3
    can only confirm or reject the supplied candidates.
    """
    l1_hits = [found.hex() for found in watermark.extract_zw(text)]

    # extract_ws yields a single value (or something falsy when no mark is
    # present), so L2 contributes zero or one hit.
    ws_mark = watermark.extract_ws(text)
    l2_hits = [ws_mark.hex()] if ws_mark else []

    l3_matches = []
    # `or ()` lets None and an empty list both skip L3 verification.
    for candidate in candidate_mark_ids or ():
        verdict = semantic.verify_semantic(text, candidate)
        if not verdict["overall_match"]:
            continue
        l3_matches.append({
            "mark_id": candidate.hex(),
            "syn_score": verdict["synonyms_score"],
            "punct_score": verdict["punctuation_score"],
        })

    return {
        "L1_hits": l1_hits,
        "L2_hits": l2_hits,
        "L3_matches": l3_matches,
    }