Zion Boggan
repos/Oversight/oversight_core/formats/pdf.py
zionboggan.com ↗
86 lines · python
History for this file →
1
"""
2
oversight_core.formats.pdf - PDF format adapter.
3
 
4
Embeds mark_id in two places:
5
  1. PDF document metadata (`/OversightMark` custom field) - fast to read, easy to strip
6
  2. Invisible text watermark on every page (zero-width unicode in a hidden text object; not implemented in this module yet - embed() writes metadata only)
7
     - survives metadata stripping, dies on "print to new PDF"
8
 
9
For strong cross-format survival, the recommended workflow is:
10
  - Extract PDF text
11
  - Apply L1/L2/L3 text watermarking to the extracted text
12
  - Use that watermarked text as the PDF content
13
 
14
But the PDF-native marks below give a low-cost attribution layer that works
15
without touching the visible content.
16
 
17
Note: pypdf handles most modern PDFs. For legacy or encrypted PDFs you may
18
need pdfrw, pdfminer, or qpdf.
19
"""
20
 
21
from __future__ import annotations
22
 
23
import io
24
from typing import Optional
25
 
26
from pypdf import PdfReader, PdfWriter
27
from pypdf.generic import NameObject, TextStringObject
28
 
29
 
30
# Document-metadata key under which embed() stores the hex-encoded mark_id
# and from which extract() reads it back.
METADATA_KEY = "/OversightMark"
31
 
32
 
33
def embed(
    pdf_bytes: bytes,
    mark_id: bytes,
    issuer_id: Optional[str] = None,
    file_id: Optional[str] = None,
) -> bytes:
    """
    Write the Oversight mark into a PDF's document metadata.

    The input document is cloned into a new writer, its existing metadata
    entries are re-added, and the mark fields are layered on top:

      - METADATA_KEY        <- mark_id as a hex string (always written)
      - /OversightIssuer    <- issuer_id (only when truthy)
      - /OversightFileId    <- file_id (only when truthy)

    Args:
        pdf_bytes: Serialized source PDF.
        mark_id: Raw mark identifier; stored hex-encoded.
        issuer_id: Optional issuer identifier.
        file_id: Optional file identifier.

    Returns:
        The serialized PDF with the updated metadata.
    """
    source = PdfReader(io.BytesIO(pdf_bytes))
    output = PdfWriter(clone_from=source)

    # Start from whatever metadata the source document already carries.
    info = dict(source.metadata or {})
    info[NameObject(METADATA_KEY)] = TextStringObject(mark_id.hex())

    optional_fields = (
        ("/OversightIssuer", issuer_id),
        ("/OversightFileId", file_id),
    )
    for pdf_key, value in optional_fields:
        # None and "" are both treated as "not provided".
        if value:
            info[NameObject(pdf_key)] = TextStringObject(value)

    output.add_metadata(info)

    with io.BytesIO() as sink:
        output.write(sink)
        return sink.getvalue()
57
 
58
 
59
def extract(pdf_bytes: bytes) -> dict:
    """
    Read the Oversight mark fields back out of a PDF's document metadata.

    Args:
        pdf_bytes: Serialized PDF to inspect.

    Returns:
        A dict with the keys "mark_id" (hex string), "issuer_id" and
        "file_id". A field that is absent from the metadata maps to None.
    """
    info = PdfReader(io.BytesIO(pdf_bytes)).metadata or {}

    # Result field name -> metadata key it is read from.
    field_keys = {
        "mark_id": METADATA_KEY,
        "issuer_id": "/OversightIssuer",
        "file_id": "/OversightFileId",
    }
    return {field: info.get(pdf_key) for field, pdf_key in field_keys.items()}
71
 
72
 
73
def extract_text_for_watermark_recovery(pdf_bytes: bytes) -> str:
    """
    Pull all text from a PDF for downstream L1/L2/L3 watermark recovery.

    Pages are processed in document order and their text is joined with
    newlines. A page with no extractable text contributes an empty string.
    A page whose extraction raises is skipped (best effort) and a warning
    is logged with the page number, so a partial result is not mistaken
    for the full document text.

    The text-layer watermarks applied by formats.text survive PDF embedding
    provided the PDF creator preserves the characters (most do).

    Args:
        pdf_bytes: Serialized PDF to read.

    Returns:
        The newline-joined text of every page that could be extracted.
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)
    reader = PdfReader(io.BytesIO(pdf_bytes))
    parts = []
    for page_number, page in enumerate(reader.pages, start=1):
        try:
            text = page.extract_text()
        except Exception:
            # Deliberately broad: one malformed page must not abort recovery
            # for the rest of the document, but the skip must be visible.
            log.warning(
                "Skipping PDF page %d: text extraction failed",
                page_number,
                exc_info=True,
            )
            continue
        parts.append(text or "")
    return "\n".join(parts)