oversight_core/formats/pdf.py

86 lines · python

"""
oversight_core.formats.pdf - PDF format adapter.
 
Embeds mark_id in two places:
  1. PDF document metadata (`/OversightMark` custom field) - fast to read, easy to strip
  2. Invisible text watermark on every page (zero-width unicode in a hidden text object)
     - survives metadata stripping, dies on "print to new PDF"
     - NOTE: not implemented by the functions below; embed() writes metadata only
 
For strong cross-format survival, the recommended workflow is:
  - Extract PDF text
  - Apply L1/L2/L3 text watermarking to the extracted text
  - Use that watermarked text as the PDF content
 
But the PDF-native marks below give a low-cost attribution layer that works
without touching the visible content.
 
Note: pypdf handles most modern PDFs. For legacy or encrypted PDFs you may
need pdfrw, pdfminer, or qpdf.
"""
 
from __future__ import annotations
 
import io
from typing import Optional
 
from pypdf import PdfReader, PdfWriter
from pypdf.generic import NameObject, TextStringObject
 
 
# Document-metadata key under which embed() stores the hex-encoded mark_id
# and from which extract() reads it back.
METADATA_KEY = "/OversightMark"
 
 
def embed(
    pdf_bytes: bytes,
    mark_id: bytes,
    issuer_id: Optional[str] = None,
    file_id: Optional[str] = None,
) -> bytes:
    """
    Write the OVERSIGHT mark into a PDF's document metadata.

    The input document is cloned into a new writer, its existing metadata
    entries are carried over, and the mark fields are added on top.

    Args:
        pdf_bytes: Serialized PDF to mark.
        mark_id: Mark identifier; stored hex-encoded under METADATA_KEY.
        issuer_id: Stored under `/OversightIssuer` when truthy.
        file_id: Stored under `/OversightFileId` when truthy.

    Returns:
        The re-serialized PDF, including the added metadata entries.
    """
    source = PdfReader(io.BytesIO(pdf_bytes))
    output = PdfWriter(clone_from=source)

    # Start from whatever metadata the document already has (possibly none).
    info = dict(source.metadata or {})
    info[NameObject(METADATA_KEY)] = TextStringObject(mark_id.hex())

    # Optional fields are only written when a non-empty value was supplied.
    optional_fields = (
        ("/OversightIssuer", issuer_id),
        ("/OversightFileId", file_id),
    )
    for pdf_key, value in optional_fields:
        if value:
            info[NameObject(pdf_key)] = TextStringObject(value)

    output.add_metadata(info)

    with io.BytesIO() as sink:
        output.write(sink)
        return sink.getvalue()
 
 
def extract(pdf_bytes: bytes) -> dict:
    """
    Read the OVERSIGHT mark fields back out of a PDF's document metadata.

    Args:
        pdf_bytes: Serialized PDF to inspect.

    Returns:
        A dict with the keys "mark_id", "issuer_id" and "file_id". Each value
        is whatever the metadata holds under METADATA_KEY, `/OversightIssuer`
        and `/OversightFileId` respectively, or None when that entry (or the
        metadata as a whole) is absent.
    """
    info = PdfReader(io.BytesIO(pdf_bytes)).metadata or {}

    # Result field name -> metadata key it is read from.
    field_keys = {
        "mark_id": METADATA_KEY,
        "issuer_id": "/OversightIssuer",
        "file_id": "/OversightFileId",
    }
    return {name: info.get(pdf_key) for name, pdf_key in field_keys.items()}
 
 
def extract_text_for_watermark_recovery(pdf_bytes: bytes) -> str:
    """
    Pull all text from a PDF for downstream L1/L2/L3 watermark recovery.

    The text-layer watermarks applied by formats.text survive PDF embedding
    provided the PDF creator preserves the characters (most do).

    Extraction is best-effort: a page whose text extraction raises is
    skipped, and a warning with the page number and traceback is logged so
    that a failed recovery can be traced back to unreadable pages.

    Args:
        pdf_bytes: Serialized PDF to read.

    Returns:
        The text of every successfully extracted page, joined with newlines.
        Pages that yield no text contribute an empty string.
    """
    import logging

    log = logging.getLogger(__name__)

    reader = PdfReader(io.BytesIO(pdf_bytes))
    parts = []
    for page_number, page in enumerate(reader.pages, start=1):
        # Keep the try body to the single call that is expected to fail.
        try:
            text = page.extract_text()
        except Exception:
            # Deliberately broad: one bad page must not abort recovery for
            # the rest of the document, but the failure should be visible.
            log.warning(
                "Skipping PDF page %d: text extraction failed",
                page_number,
                exc_info=True,
            )
            continue
        parts.append(text or "")
    return "\n".join(parts)

1	"""
2	oversight_core.formats.pdf - PDF format adapter.
3
4	Embeds mark_id in two places:
5	1. PDF document metadata (`/Oversight` custom field) - fast to read, easy to strip
6	2. Invisible text watermark on every page (zero-width unicode in a hidden text object)
7	- survives metadata stripping, dies on "print to new PDF"
8
9	For strong cross-format survival, the recommended workflow is:
10	- Extract PDF text
11	- Apply L1/L2/L3 text watermarking to the extracted text
12	- Use that watermarked text as the PDF content
13
14	But the PDF-native marks below give a low-cost attribution layer that works
15	without touching the visible content.
16
17	Note: pypdf handles most modern PDFs. For legacy or encrypted PDFs you may
18	need pdfrw, pdfminer, or qpdf.
19	"""
20
21	from __future__ import annotations
22
23	import io
24	from typing import Optional
25
26	from pypdf import PdfReader, PdfWriter
27	from pypdf.generic import NameObject, TextStringObject
28
29
30	METADATA_KEY = "/OversightMark"
31
32
33	def embed(
34	pdf_bytes: bytes,
35	mark_id: bytes,
36	issuer_id: Optional[str] = None,
37	file_id: Optional[str] = None,
38	) -> bytes:
39	"""
40	Embed mark_id in PDF metadata. Returns the modified PDF bytes.
41	"""
42	reader = PdfReader(io.BytesIO(pdf_bytes))
43	writer = PdfWriter(clone_from=reader)
44
45	metadata = dict(reader.metadata or {})
46	metadata[NameObject(METADATA_KEY)] = TextStringObject(mark_id.hex())
47	if issuer_id:
48	metadata[NameObject("/OversightIssuer")] = TextStringObject(issuer_id)
49	if file_id:
50	metadata[NameObject("/OversightFileId")] = TextStringObject(file_id)
51
52	writer.add_metadata(metadata)
53
54	buf = io.BytesIO()
55	writer.write(buf)
56	return buf.getvalue()
57
58
59	def extract(pdf_bytes: bytes) -> dict:
60	"""
61	Extract OVERSIGHT marks from PDF metadata.
62	Returns {"mark_id": hex or None, "issuer_id": str or None, "file_id": str or None}.
63	"""
64	reader = PdfReader(io.BytesIO(pdf_bytes))
65	meta = reader.metadata or {}
66	return {
67	"mark_id": meta.get(METADATA_KEY),
68	"issuer_id": meta.get("/OversightIssuer"),
69	"file_id": meta.get("/OversightFileId"),
70	}
71
72
73	def extract_text_for_watermark_recovery(pdf_bytes: bytes) -> str:
74	"""
75	Pull all text from a PDF for downstream L1/L2/L3 watermark recovery.
76	The text-layer watermarks applied by formats.text survive PDF embedding
77	provided the PDF creator preserves the characters (most do).
78	"""
79	reader = PdfReader(io.BytesIO(pdf_bytes))
80	parts = []
81	for page in reader.pages:
82	try:
83	parts.append(page.extract_text() or "")
84	except Exception:
85	continue
86	return "\n".join(parts)