| 1 | """ |
| 2 | oversight_core.formats.pdf - PDF format adapter. |
| 3 | |
| 4 | Embeds mark_id in two places: |
| 5 | 1. PDF document metadata (`/OversightMark` custom field) - fast to read, easy to strip |
| 6 | 2. Invisible text watermark on every page (zero-width unicode in a hidden text object) |
| 7 | - survives metadata stripping, dies on "print to new PDF" |
| 8 | |
| 9 | For strong cross-format survival, the recommended workflow is: |
| 10 | - Extract PDF text |
| 11 | - Apply L1/L2/L3 text watermarking to the extracted text |
| 12 | - Use that watermarked text as the PDF content |
| 13 | |
| 14 | But the PDF-native marks below give a low-cost attribution layer that works |
| 15 | without touching the visible content. |
| 16 | |
| 17 | Note: pypdf handles most modern PDFs. For legacy or encrypted PDFs you may |
| 18 | need pdfrw, pdfminer, or qpdf. |
| 19 | """ |
| 20 | |
| 21 | from __future__ import annotations |
| 22 | |
| 23 | import io |
| 24 | from typing import Optional |
| 25 | |
| 26 | from pypdf import PdfReader, PdfWriter |
| 27 | from pypdf.generic import NameObject, TextStringObject |
| 28 | |
| 29 | |
# Document-information (metadata) key under which embed() stores the
# hex-encoded mark_id and from which extract() reads it back.
METADATA_KEY = "/OversightMark"
| 31 | |
| 32 | |
def embed(
    pdf_bytes: bytes,
    mark_id: bytes,
    issuer_id: Optional[str] = None,
    file_id: Optional[str] = None,
) -> bytes:
    """
    Write the mark into the PDF's document metadata and return the new PDF.

    The input document is cloned into a fresh writer, its existing metadata
    entries are carried over, and the following entries are set:

    - ``METADATA_KEY``      -> ``mark_id`` as a hex string (always)
    - ``/OversightIssuer``  -> ``issuer_id`` (only when truthy)
    - ``/OversightFileId``  -> ``file_id`` (only when truthy)

    Entries already present in the source metadata are kept; in particular,
    an existing ``/OversightIssuer`` or ``/OversightFileId`` is left as-is
    when the corresponding argument is omitted or empty.
    """
    source = PdfReader(io.BytesIO(pdf_bytes))
    target = PdfWriter(clone_from=source)

    # Start from whatever metadata the source already has (may be absent).
    info = dict(source.metadata or {})
    info[NameObject(METADATA_KEY)] = TextStringObject(mark_id.hex())

    # Optional attribution fields: written only when a non-empty value is given.
    optional_fields = (
        ("/OversightIssuer", issuer_id),
        ("/OversightFileId", file_id),
    )
    for key, value in optional_fields:
        if value:
            info[NameObject(key)] = TextStringObject(value)

    target.add_metadata(info)

    with io.BytesIO() as sink:
        target.write(sink)
        return sink.getvalue()
| 57 | |
| 58 | |
def _info_text(info, key: str) -> Optional[str]:
    """Return metadata entry *key* from *info* as a plain ``str``, or None.

    Non-string entries are treated as absent.
    """
    value = info.get(key)
    # Plain dict.get() does not go through pypdf's resolving __getitem__, so
    # an entry stored as an indirect reference may come back unresolved.
    resolve = getattr(value, "get_object", None)
    if resolve is not None:
        value = resolve()
    if isinstance(value, bytes):
        # NOTE(review): pypdf appears to surface strings it could not decode
        # as bytes; Latin-1 maps every byte, so this cannot fail. Confirm the
        # encoding choice if non-ASCII issuer/file ids are expected.
        return value.decode("latin-1")
    if isinstance(value, str):
        # Collapse pypdf's str subclass to a builtin str.
        return str(value)
    return None


def extract(pdf_bytes: bytes) -> dict:
    """
    Extract OVERSIGHT marks from PDF metadata.

    Returns {"mark_id": hex or None, "issuer_id": str or None,
    "file_id": str or None}. A key maps to None when the PDF has no
    metadata, the entry is missing, or the entry is not a string. Values are
    returned as plain ``str`` with indirect references resolved.
    """
    reader = PdfReader(io.BytesIO(pdf_bytes))
    meta = reader.metadata or {}
    return {
        "mark_id": _info_text(meta, METADATA_KEY),
        "issuer_id": _info_text(meta, "/OversightIssuer"),
        "file_id": _info_text(meta, "/OversightFileId"),
    }
| 71 | |
| 72 | |
def extract_text_for_watermark_recovery(pdf_bytes: bytes) -> str:
    """
    Pull all text from a PDF for downstream L1/L2/L3 watermark recovery.

    Text is extracted page by page and joined with newlines. A page that
    yields no text contributes an empty string. Extraction is best-effort:
    a page whose extraction raises is skipped (it contributes nothing to the
    result) and a warning with the traceback is logged, so a partial result
    can be told apart from a clean one.

    The text-layer watermarks applied by formats.text survive PDF embedding
    provided the PDF creator preserves the characters (most do).
    """
    # Function-scope import keeps this block self-contained.
    import logging

    log = logging.getLogger(__name__)
    reader = PdfReader(io.BytesIO(pdf_bytes))
    parts = []
    for page_number, page in enumerate(reader.pages, start=1):
        # Keep the try body to the one call that is expected to fail.
        try:
            text = page.extract_text()
        except Exception:
            log.warning(
                "Skipping PDF page %d: text extraction failed",
                page_number,
                exc_info=True,
            )
            continue
        parts.append(text or "")
    return "\n".join(parts)