| @@ -1,28 +1,103 @@ | ||
| - | # Oversight v0.5 | |
| + | # Oversight Protocol | |
| - | **Open protocol + reference implementation for data provenance, attribution, and leak detection.** | |
| + | **Open protocol for cryptographic data provenance, recipient attribution, and leak detection.** | |
| - | Format-agnostic. Post-quantum-verified (ML-KEM-768 + ML-DSA-65 via liboqs). Jurisdiction-aware. Fully passive - no code execution on readers, no RATs, no defensive malware. | |
| + | Co-authored by Zion Boggan and Claude Opus 4.6/4.7 (Anthropic). | |
| - | **Truly open source.** No cloud vendor lock-in. No paid service required. No custom cryptography. Every primitive is NIST-standardized and publicly auditable. | |
| + | Format-agnostic. Post-quantum ready (ML-KEM-768 + ML-DSA-65). Three-layer watermarking that survives format conversion, invisible-char stripping, and screenshot/OCR. Content fingerprinting that identifies leaked copies even when all watermarks are destroyed. | |
| + | ||
| + | No cloud vendor lock-in. No paid service required. No custom cryptography. Apache 2.0. | |
| + | ||
| + | **Website:** https://oversight-protocol.github.io/oversight/ | |
| --- | ||
| - | ## What's new in v0.4 | |
| + | ## Install | |
| + | ||
| + | Requires Python 3.10+. | |
| + | ||
| + | ```bash | |
| + | # Clone the repo | |
| + | git clone https://github.com/oversight-protocol/oversight.git | |
| + | cd oversight | |
| + | ||
| + | # Install (adds the `oversight` command to your PATH) | |
| + | pip install . | |
| + | ||
| + | # Verify | |
| + | oversight status | |
| + | ``` | |
| + | ||
| + | That's it. The `oversight` command is now available globally. | |
| + | ||
| + | ### Optional extras | |
| + | ||
| + | ```bash | |
| + | # Include registry server (FastAPI) | |
| + | pip install ".[registry]" | |
| + | ||
| + | # Include format adapters (PDF, DOCX, image watermarking) | |
| + | pip install ".[formats]" | |
| + | ||
| + | # Everything | |
| + | pip install ".[all]" | |
| + | ``` | |
| + | ||
| + | ## Quick start | |
| + | ||
| + | ```bash | |
| + | # 1. Initialize a project directory | |
| + | mkdir my-project && cd my-project | |
| + | oversight init | |
| + | ||
| + | # 2. Generate your issuer identity | |
| + | oversight keys generate --name zion | |
| + | ||
| + | # 3. Generate a recipient identity (they would do this on their machine) | |
| + | oversight keys generate --name alice --out alice.json | |
| + | ||
| + | # 4. Import the recipient's public key | |
| + | oversight keys import alice.pub.json | |
| + | ||
| + | # 5. Seal a document to the recipient (watermarks embedded by default) | |
| + | oversight seal report.txt --to alice | |
| + | ||
| + | # 6. The recipient opens the sealed file | |
| + | oversight open report.txt.sealed --out report-decrypted.txt | |
| + | ||
| + | # 7. If the document leaks, attribute it | |
| + | oversight attribute --leak leaked.txt --fingerprints .oversight/fingerprints | |
| + | ``` | |
| + | ||
| + | ### What happens when you seal | |
| + | ||
| + | The seal command applies three watermark layers to the document, each targeting a different attack surface: | |
| + | ||
| + | - **L1** inserts zero-width Unicode characters (survives copy-paste) | |
| + | - **L2** encodes bits in trailing whitespace patterns (survives most editors) | |
| + | - **L3** rotates synonyms from a 151-class dictionary, adjusts punctuation style, spelling variants, and contractions (survives format conversion, invisible-char stripping, and screenshot/OCR) | |
| + | ||
| + | Then it encrypts to the recipient's X25519 public key, timestamps via RFC 3161, logs to the Merkle tree, and writes the `.sealed` file plus a `.fingerprint.json` sidecar for the content fingerprint database. | |
| + | ||
| + | ### What happens when you attribute | |
| + | ||
| + | The attribute command runs a 5-phase pipeline: | |
| - | **Rust port expanded from core to core+enforcement+semantics.** Three new Rust crates on top of the v0.3 core: | |
| + | 1. **Direct extraction** of L1/L2 marks from the leaked text | |
| + | 2. **Registry query** for candidate mark IDs | |
| + | 3. **L3 semantic verification** against candidates (synonym score + punctuation + spelling + contractions) | |
| + | 4. **Multi-layer Bayesian fusion** combining all evidence into ranked candidates | |
| + | 5. **Content fingerprint comparison** (winnowing + sentence hashing) as a last resort when all watermarks are stripped | |
| - | - `oversight-tlog` - RFC 6962-compliant Merkle transparency log with signed tree heads, inclusion proofs, durable append. | |
| - | - `oversight-policy` - TOCTOU-safe max_opens enforcement, jurisdiction / not_after / not_before checks, file-id sanitization. | |
| - | - `oversight-semantic` - L3 airgap-strip-survivor watermarking with the full 151-class synonym dictionary and URL/code/path/hex/base64 skip regions. | |
| + | ## What's new in v0.4.3 | |
| - | **RFC 6962 fix in Python.** The v0.2 tlog used a promote-odd-trailing shortcut that was self-consistent but not RFC 6962 compliant - inclusion proofs wouldn't verify against Sigstore tooling. Now ported to the canonical largest-power-of-2 left-heavy split. Added `verify_inclusion_proof` helper. Tested across asymmetric tree sizes. | |
| + | **Anti-stripping defenses.** ECC-protected synonym bits (R=7 repetition codes), winnowing content fingerprints, sentence-level content hashing, 25 spelling variant pairs, 30 contraction choices, number formatting marks. The VM-strip-export attack (open in airgapped VM, strip invisible chars, export clean file) is now defended by content fingerprinting. | |
| - | **Fuzz harness.** `cargo-fuzz` targets for container_parser and manifest_parser. Ready to run 24+ hours before a paid audit engagement. | |
| + | **Rich interactive CLI.** Colorful terminal interface with progress bars, panels, config management, and streamlined commands. Run `oversight init` to get started. | |
| - | **Hardware key setup guide.** `docs/HARDWARE_KEYS.md` covers YubiKey / Nitrokey / OnlyKey end-to-end - PIN/PUK setup, PIV slot provisioning, curve choice rationale, revocation, threat model, deployment checklist. | |
| + | **L3 integration.** The 151-class synonym rotation system and punctuation fingerprinting, previously implemented but not wired into the pipeline, are now fully integrated. Multi-layer Bayesian fusion combines L1, L2, and L3 evidence. | |
| - | **Everything from v0.3 is still here.** FreeTSA RFC 3161 timestamps, cross-language conformance, Python↔Rust bit-for-bit compatibility, PQ hybrid, multi-recipient sealing, registry with signed bundles. | |
| + | See `CHANGELOG.md` for full version history. | |
| ## Repository layout | ||
| @@ -46,6 +46,8 @@ from oversight_core import ( | ||
| watermark, | ||
| ) | ||
| from oversight_core.container import SealedFile | ||
| + | from oversight_core import semantic | |
| + | from oversight_core.fingerprint import ContentFingerprint | |
| # ---------------- keygen ---------------- | ||
| @@ -78,7 +80,7 @@ def cmd_seal(args): | ||
| issuer = json.loads(Path(args.issuer_key).read_text()) | ||
| rec_pub = json.loads(Path(args.recipient_pub).read_text()) | ||
| - | # Optional watermarking (text files only, MVP) | |
| + | # Optional watermarking (text files only) | |
| watermarks_for_manifest: list[WatermarkRef] = [] | ||
| if args.watermark: | ||
| try: | ||
| @@ -88,19 +90,31 @@ def cmd_seal(args): | ||
| text = None | ||
| if text is not None: | ||
| - | mark_id_zw = watermark.new_mark_id() | |
| - | mark_id_ws = watermark.new_mark_id() | |
| - | text = watermark.embed_zw(text, mark_id_zw) | |
| - | text = watermark.embed_ws(text, mark_id_ws) | |
| + | # Generate a single mark_id shared across all layers for simpler | |
| + | # attribution (one ID per recipient, not one per layer). | |
| + | mark_id = watermark.new_mark_id() | |
| + | ||
| + | # Apply layers in correct order: L3 first (rewrites words), | |
| + | # then L2 (trailing whitespace), then L1 (zero-width chars). | |
| + | # This prevents L1's invisible chars from fragmenting L3 synonym | |
| + | # words during embedding. | |
| + | text = semantic.apply_semantic(text, mark_id) | |
| + | text = watermark.embed_ws(text, mark_id) | |
| + | text = watermark.embed_zw(text, mark_id) | |
| plaintext = text.encode("utf-8") | ||
| + | ||
| + | watermarks_for_manifest.append(WatermarkRef( | |
| + | layer="L1_zero_width", mark_id=mark_id.hex() | |
| + | )) | |
| watermarks_for_manifest.append(WatermarkRef( | ||
| - | layer="L1_zero_width", mark_id=mark_id_zw.hex() | |
| + | layer="L2_whitespace", mark_id=mark_id.hex() | |
| )) | ||
| watermarks_for_manifest.append(WatermarkRef( | ||
| - | layer="L2_whitespace", mark_id=mark_id_ws.hex() | |
| + | layer="L3_semantic", mark_id=mark_id.hex() | |
| )) | ||
| - | print(f"[+] embedded L1 mark {mark_id_zw.hex()}") | |
| - | print(f"[+] embedded L2 mark {mark_id_ws.hex()}") | |
| + | print(f"[+] embedded L1 mark {mark_id.hex()}") | |
| + | print(f"[+] embedded L2 mark {mark_id.hex()}") | |
| + | print(f"[+] embedded L3 mark {mark_id.hex()} (semantic + punctuation)") | |
| # Recipient | ||
| recipient = Recipient( | ||
| @@ -129,6 +143,18 @@ def cmd_seal(args): | ||
| manifest.watermarks = watermarks_for_manifest | ||
| manifest.beacons = [b.to_dict() for b in beacons] | ||
| + | # Compute content fingerprint for the watermarked plaintext. | |
| + | # This is the per-recipient fingerprint stored server-side so we can | |
| + | # identify the source copy even if all watermarks are stripped (VM export attack). | |
| + | fingerprint = None | |
| + | try: | |
| + | fingerprint_text = plaintext.decode("utf-8") | |
| + | fingerprint = ContentFingerprint.from_text(fingerprint_text) | |
| + | print(f"[+] content fingerprint: {len(fingerprint.winnowing_fp)} winnow hashes, " | |
| + | f"{len(fingerprint.sentence_fp)} sentence hashes") | |
| + | except UnicodeDecodeError: | |
| + | pass # binary file, no fingerprint | |
| + | ||
| blob = seal( | ||
| plaintext=plaintext, | ||
| manifest=manifest, | ||
| @@ -142,6 +168,17 @@ def cmd_seal(args): | ||
| print(f"[+] recipient={recipient.recipient_id}") | ||
| print(f"[+] beacons={len(beacons)} watermarks={len(watermarks_for_manifest)}") | ||
| + | # Store fingerprint alongside the sealed file | |
| + | if fingerprint: | |
| + | fp_path = Path(args.out).with_suffix(".fingerprint.json") | |
| + | fp_path.write_text(json.dumps({ | |
| + | "file_id": manifest.file_id, | |
| + | "recipient_id": rec_pub["id"], | |
| + | "mark_id": watermarks_for_manifest[0].mark_id if watermarks_for_manifest else None, | |
| + | "fingerprint": fingerprint.to_dict(), | |
| + | }, indent=2)) | |
| + | print(f"[+] wrote fingerprint to {fp_path}") | |
| + | ||
| # Register with registry (optional) | ||
| if args.register: | ||
| reg_payload = { | ||
| @@ -149,6 +186,8 @@ def cmd_seal(args): | ||
| "beacons": [b.to_dict() for b in beacons], | ||
| "watermarks": [w.__dict__ for w in watermarks_for_manifest], | ||
| } | ||
| + | if fingerprint: | |
| + | reg_payload["fingerprint"] = fingerprint.to_dict() | |
| try: | ||
| resp = httpx.post( | ||
| f"{args.register.rstrip('/')}/register", | ||
| @@ -193,34 +232,173 @@ def cmd_inspect(args): | ||
| def cmd_attribute(args): | ||
| text = Path(args.leak).read_text(encoding="utf-8", errors="replace") | ||
| - | marks = watermark.recover_marks(text) | |
| - | print("[*] recovered marks:") | |
| - | any_found = False | |
| - | for layer, mlist in marks.items(): | |
| - | for m in mlist: | |
| - | print(f" {layer}: {m.hex()}") | |
| - | any_found = True | |
| - | if not any_found: | |
| - | print(" (none)") | |
| - | return | |
| - | ||
| - | print(f"[*] querying registry {args.registry} ...") | |
| - | for layer, mlist in marks.items(): | |
| - | for m in mlist: | |
| + | ||
| + | # Phase 1: Extract L1/L2 marks directly from text | |
| + | print("[*] Phase 1: Direct extraction (L1 + L2)") | |
| + | l1_marks = watermark.extract_zw(text) | |
| + | l2_candidate, l2_conf, l2_bits, l2_needed = watermark.extract_ws_partial(text) | |
| + | ||
| + | l1_unique = list(set(l1_marks)) | |
| + | direct_candidates: list[bytes] = list(l1_unique) | |
| + | if l2_candidate and l2_conf >= 0.5: | |
| + | if l2_candidate not in direct_candidates: | |
| + | direct_candidates.append(l2_candidate) | |
| + | ||
| + | if l1_unique: | |
| + | print(f" L1: {len(l1_marks)} frames, {len(l1_unique)} unique mark(s)") | |
| + | for m in l1_unique: | |
| + | print(f" {m.hex()}") | |
| + | else: | |
| + | print(" L1: no zero-width frames found (stripped?)") | |
| + | ||
| + | if l2_conf >= 1.0: | |
| + | print(f" L2: {l2_bits}/{l2_needed} bits recovered (100%): {l2_candidate.hex()}") | |
| + | elif l2_conf > 0: | |
| + | print(f" L2: {l2_bits}/{l2_needed} bits recovered ({l2_conf:.0%}): {l2_candidate.hex()} (partial)") | |
| + | else: | |
| + | print(" L2: no trailing whitespace marks found (stripped?)") | |
| + | ||
| + | # Phase 2: Query registry for candidate mark_ids (for L3 verification) | |
| + | registry_candidates: list[bytes] = [] | |
| + | print(f"\n[*] Phase 2: Registry query ({args.registry})") | |
| + | if direct_candidates: | |
| + | for m in direct_candidates: | |
| try: | ||
| resp = httpx.post( | ||
| f"{args.registry.rstrip('/')}/attribute", | ||
| - | json={"mark_id": m.hex(), "layer": layer}, | |
| + | json={"mark_id": m.hex(), "layer": "L1_zero_width"}, | |
| timeout=10, | ||
| ) | ||
| data = resp.json() | ||
| if data.get("found"): | ||
| - | print(f"\n[!!] ATTRIBUTION: mark {m.hex()} ({layer})") | |
| - | print(f" file_id = {data['file_id']}") | |
| - | print(f" recipient = {data['recipient_id']}") | |
| - | print(f" issuer = {data['issuer_id']}") | |
| + | print(f" MATCH: {m.hex()} -> recipient={data['recipient_id']}, " | |
| + | f"file={data['file_id']}") | |
| + | except Exception as e: | |
| + | print(f" registry query failed: {e}") | |
| + | ||
| + | # Also fetch all mark_ids for this file (for L3 verification) | |
| + | try: | |
| + | resp = httpx.get( | |
| + | f"{args.registry.rstrip('/')}/marks", | |
| + | timeout=10, | |
| + | ) | |
| + | if resp.status_code == 200: | |
| + | registry_data = resp.json() | |
| + | for entry in registry_data.get("marks", []): | |
| + | mid_bytes = bytes.fromhex(entry["mark_id"]) | |
| + | if mid_bytes not in registry_candidates: | |
| + | registry_candidates.append(mid_bytes) | |
| + | print(f" fetched {len(registry_candidates)} candidate mark_id(s) from registry") | |
| + | except Exception: | |
| + | pass # registry may not support /marks endpoint | |
| + | ||
| + | # Phase 3: L3 semantic verification against candidates | |
| + | all_candidates = direct_candidates + [ | |
| + | m for m in registry_candidates if m not in direct_candidates | |
| + | ] | |
| + | ||
| + | print(f"\n[*] Phase 3: L3 semantic verification ({len(all_candidates)} candidate(s))") | |
| + | if all_candidates: | |
| + | l3_hits = watermark.verify_l3(text, all_candidates) | |
| + | if l3_hits: | |
| + | for mid, score, detail in l3_hits: | |
| + | print(f" L3 MATCH: {mid.hex()} score={score:.2f} " | |
| + | f"(synonyms={detail['synonyms_score']:.2f}, " | |
| + | f"punct={detail['punctuation_hits']}, " | |
| + | f"dict={detail['dict_version']})") | |
| + | else: | |
| + | print(" L3: no candidates matched above threshold") | |
| + | else: | |
| + | print(" L3: no candidates available (L1/L2 stripped, registry unreachable?)") | |
| + | ||
| + | # Phase 4: Multi-layer fusion | |
| + | print("\n[*] Phase 4: Multi-layer fusion") | |
| + | result = watermark.recover_marks_v2(text, all_candidates if all_candidates else None) | |
| + | if result["candidates"]: | |
| + | for mark_id, score, layers in result["candidates"]: | |
| + | print(f" {mark_id.hex()} score={score:.3f} layers={layers}") | |
| + | best = result["candidates"][0] | |
| + | print(f"\n[!!] BEST ATTRIBUTION: {best[0].hex()}") | |
| + | print(f" confidence = {best[1]:.1%}") | |
| + | print(f" evidence = {best[2]}") | |
| + | ||
| + | # Final registry lookup for the winning candidate | |
| + | try: | |
| + | resp = httpx.post( | |
| + | f"{args.registry.rstrip('/')}/attribute", | |
| + | json={"mark_id": best[0].hex(), "layer": "fused"}, | |
| + | timeout=10, | |
| + | ) | |
| + | data = resp.json() | |
| + | if data.get("found"): | |
| + | print(f" file_id = {data['file_id']}") | |
| + | print(f" recipient = {data['recipient_id']}") | |
| + | print(f" issuer = {data['issuer_id']}") | |
| + | except Exception: | |
| + | pass | |
| + | else: | |
| + | print(" No marks recovered from any layer.") | |
| + | print("\n[*] Diagnostics:") | |
| + | for d in result["diagnostics"]: | |
| + | print(f" {d}") | |
| + | ||
| + | # Phase 5: Content fingerprint comparison (VM-strip-export defense) | |
| + | if args.fingerprints: | |
| + | print(f"\n[*] Phase 5: Content fingerprint comparison") | |
| + | leak_fp = ContentFingerprint.from_text(text) | |
| + | print(f" Leak fingerprint: {len(leak_fp.winnowing_fp)} winnow hashes, " | |
| + | f"{len(leak_fp.sentence_fp)} sentence hashes") | |
| + | ||
| + | best_fp_match = None | |
| + | best_fp_score = 0.0 | |
| + | ||
| + | fp_dir = Path(args.fingerprints) | |
| + | if fp_dir.is_dir(): | |
| + | fp_files = list(fp_dir.glob("*.fingerprint.json")) | |
| + | elif fp_dir.is_file(): | |
| + | fp_files = [fp_dir] | |
| + | else: | |
| + | fp_files = [] | |
| + | print(f" [!] fingerprint path not found: {args.fingerprints}") | |
| + | ||
| + | for fp_file in fp_files: | |
| + | try: | |
| + | fp_data = json.loads(fp_file.read_text()) | |
| + | stored_fp = ContentFingerprint.from_dict(fp_data["fingerprint"]) | |
| + | sim = leak_fp.similarity(stored_fp) | |
| + | recipient_id = fp_data.get("recipient_id", "unknown") | |
| + | mark_id = fp_data.get("mark_id", "unknown") | |
| + | ||
| + | if sim["combined"] >= 0.1: | |
| + | print(f" {fp_file.name}: recipient={recipient_id} " | |
| + | f"winnow={sim['winnowing']:.2f} " | |
| + | f"sentence={sim['sentence']:.2f} " | |
| + | f"combined={sim['combined']:.2f} " | |
| + | f"[{sim['verdict']}]") | |
| + | ||
| + | if sim["combined"] > best_fp_score: | |
| + | best_fp_score = sim["combined"] | |
| + | best_fp_match = { | |
| + | "file": fp_file.name, | |
| + | "recipient_id": recipient_id, | |
| + | "mark_id": mark_id, | |
| + | "similarity": sim, | |
| + | } | |
| except Exception as e: | ||
| - | print(f"[!] registry query failed: {e}") | |
| + | print(f" [!] error reading {fp_file.name}: {e}") | |
| + | ||
| + | if best_fp_match and best_fp_score >= 0.3: | |
| + | verdict = best_fp_match["similarity"]["verdict"] | |
| + | print(f"\n[!!] FINGERPRINT ATTRIBUTION [{verdict}]:") | |
| + | print(f" recipient = {best_fp_match['recipient_id']}") | |
| + | print(f" mark_id = {best_fp_match['mark_id']}") | |
| + | print(f" confidence = {best_fp_score:.1%}") | |
| + | print(f" winnowing = {best_fp_match['similarity']['winnowing']:.1%}") | |
| + | print(f" sentence = {best_fp_match['similarity']['sentence']:.1%}") | |
| + | elif fp_files: | |
| + | print(" No fingerprint match above threshold.") | |
| + | else: | |
| + | print(" No fingerprint files found to compare against.") | |
| # ---------------- main ---------------- | ||
| @@ -256,6 +434,8 @@ def main(): | ||
| a = sub.add_parser("attribute") | ||
| a.add_argument("--leak", required=True) | ||
| a.add_argument("--registry", required=True) | ||
| + | a.add_argument("--fingerprints", default=None, | |
| + | help="path to fingerprint file or directory for VM-strip detection") | |
| args = p.parse_args() | ||
| @@ -0,0 +1,1400 @@ | ||
| + | #!/usr/bin/env python3 | |
| + | """ | |
| + | OVERSIGHT Rich CLI -- Interactive command-line interface with rich output. | |
| + | ||
| + | Provides the `oversight` command with colorful, structured output for all | |
| + | Oversight Protocol operations: key management, sealing, opening, inspection, | |
| + | attribution, and registry interaction. | |
| + | ||
| + | Entry point: main() | |
| + | """ | |
| + | ||
| + | from __future__ import annotations | |
| + | ||
| + | import argparse | |
| + | import json | |
| + | import os | |
| + | import sys | |
| + | import time | |
| + | from pathlib import Path | |
| + | from typing import Optional | |
| + | ||
| + | # Make oversight_core importable when running from repo root | |
| + | ROOT = Path(__file__).resolve().parent.parent | |
| + | sys.path.insert(0, str(ROOT)) | |
| + | ||
| + | from rich.console import Console | |
| + | from rich.panel import Panel | |
| + | from rich.table import Table | |
| + | from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn | |
| + | from rich.prompt import Prompt, Confirm | |
| + | from rich.text import Text | |
| + | from rich.columns import Columns | |
| + | from rich.rule import Rule | |
| + | from rich import box | |
| + | ||
| + | import httpx | |
| + | ||
| + | from oversight_core import ( | |
| + | ClassicIdentity, | |
| + | Manifest, | |
| + | Recipient, | |
| + | WatermarkRef, | |
| + | content_hash, | |
| + | seal, | |
| + | open_sealed, | |
| + | beacon, | |
| + | watermark, | |
| + | __version__ as core_version, | |
| + | ) | |
| + | from oversight_core.container import SealedFile | |
| + | from oversight_core import semantic | |
| + | from oversight_core.fingerprint import ContentFingerprint | |
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Constants | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | CLI_VERSION = "0.4.1" | |
| + | CONFIG_FILENAME = "config.json" | |
| + | CONFIG_DIR_NAME = ".oversight" | |
| + | ||
| + | console = Console() | |
| + | err_console = Console(stderr=True) | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Banner | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def print_banner(): | |
| + | """Display the startup banner with version info.""" | |
| + | banner_text = Text() | |
| + | banner_text.append("OVERSIGHT", style="bold bright_white") | |
| + | banner_text.append(" PROTOCOL", style="bold cyan") | |
| + | ||
| + | version_line = Text() | |
| + | version_line.append(f"cli v{CLI_VERSION}", style="dim") | |
| + | version_line.append(" | ", style="dim") | |
| + | version_line.append(f"core v{core_version}", style="dim") | |
| + | version_line.append(" | ", style="dim") | |
| + | version_line.append("Sealed Entity, Notarized Trust", style="dim italic") | |
| + | ||
| + | combined = Text() | |
| + | combined.append(banner_text) | |
| + | combined.append("\n") | |
| + | combined.append(version_line) | |
| + | ||
| + | console.print(Panel( | |
| + | combined, | |
| + | border_style="cyan", | |
| + | padding=(0, 2), | |
| + | )) | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Config discovery and management | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def find_config_dir() -> Optional[Path]: | |
| + | """ | |
| + | Search for .oversight/ directory. Order: | |
| + | 1. Current working directory | |
| + | 2. Parent directories (up to root) | |
| + | 3. ~/.oversight/ | |
| + | Returns the path if found, None otherwise. | |
| + | """ | |
| + | # Check current and parents | |
| + | check = Path.cwd() | |
| + | while True: | |
| + | candidate = check / CONFIG_DIR_NAME | |
| + | if candidate.is_dir(): | |
| + | return candidate | |
| + | parent = check.parent | |
| + | if parent == check: | |
| + | break | |
| + | check = parent | |
| + | ||
| + | # Check home directory | |
| + | home_config = Path.home() / CONFIG_DIR_NAME | |
| + | if home_config.is_dir(): | |
| + | return home_config | |
| + | ||
| + | return None | |
| + | ||
| + | ||
| + | def load_config() -> dict: | |
| + | """Load config from discovered .oversight/ directory. Returns empty dict if not found.""" | |
| + | config_dir = find_config_dir() | |
| + | if config_dir is None: | |
| + | return {} | |
| + | config_file = config_dir / CONFIG_FILENAME | |
| + | if not config_file.exists(): | |
| + | return {"_config_dir": str(config_dir)} | |
| + | try: | |
| + | cfg = json.loads(config_file.read_text()) | |
| + | cfg["_config_dir"] = str(config_dir) | |
| + | return cfg | |
| + | except (json.JSONDecodeError, OSError) as e: | |
| + | err_console.print(f"[yellow]Warning: failed to read config: {e}[/]") | |
| + | return {"_config_dir": str(config_dir)} | |
| + | ||
| + | ||
| + | def save_config(config_dir: Path, config: dict) -> None: | |
| + | """Write config to the given .oversight/ directory.""" | |
| + | clean = {k: v for k, v in config.items() if not k.startswith("_")} | |
| + | config_file = config_dir / CONFIG_FILENAME | |
| + | config_file.write_text(json.dumps(clean, indent=2)) | |
| + | ||
| + | ||
| + | def config_dir_from_cfg(cfg: dict) -> Optional[Path]: | |
| + | """Extract the config directory path from a loaded config dict.""" | |
| + | raw = cfg.get("_config_dir") | |
| + | if raw: | |
| + | return Path(raw) | |
| + | return None | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Utility helpers | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def error_panel(message: str, suggestion: str = "") -> None: | |
| + | """Print a red error panel with optional suggestion.""" | |
| + | body = Text(message, style="bold red") | |
| + | if suggestion: | |
| + | body.append(f"\n\nSuggestion: {suggestion}", style="yellow") | |
| + | console.print(Panel(body, title="[red]Error[/]", border_style="red", padding=(0, 2))) | |
| + | ||
| + | ||
| + | def success(message: str) -> None: | |
| + | console.print(f"[green][+][/] {message}") | |
| + | ||
| + | ||
| + | def warn(message: str) -> None: | |
| + | console.print(f"[yellow][!][/] {message}") | |
| + | ||
| + | ||
| + | def info(message: str) -> None: | |
| + | console.print(f"[cyan][*][/] {message}") | |
| + | ||
| + | ||
| + | def format_hex_short(hex_str: str, length: int = 16) -> str: | |
| + | """Shorten a hex string for display.""" | |
| + | if len(hex_str) <= length: | |
| + | return hex_str | |
| + | return hex_str[:length] + "..." | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Command: init | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def cmd_init(args): | |
| + | """Initialize a .oversight/ directory with config.""" | |
| + | target = Path(args.path) if args.path else Path.cwd() | |
| + | config_dir = target / CONFIG_DIR_NAME | |
| + | ||
| + | if config_dir.exists() and not args.force: | |
| + | error_panel( | |
| + | f"Directory already exists: {config_dir}", | |
| + | "Use --force to reinitialize." | |
| + | ) | |
| + | sys.exit(1) | |
| + | ||
| + | config_dir.mkdir(parents=True, exist_ok=True) | |
| + | (config_dir / "recipients").mkdir(exist_ok=True) | |
| + | (config_dir / "fingerprints").mkdir(exist_ok=True) | |
| + | ||
| + | config = { | |
| + | "issuer_identity": "", | |
| + | "registry_url": args.registry_url or "http://localhost:8000", | |
| + | "registry_domain": args.registry_domain or "oversight.example", | |
| + | "default_watermark": True, | |
| + | "content_type": "application/octet-stream", | |
| + | } | |
| + | ||
| + | save_config(config_dir, config) | |
| + | ||
| + | console.print(Panel( | |
| + | Text.assemble( | |
| + | ("Initialized .oversight/ directory\n\n", "bold green"), | |
| + | ("Location: ", ""), | |
| + | (str(config_dir), "cyan"), | |
| + | ("\n\nCreated:\n", ""), | |
| + | (" config.json ", "white"), | |
| + | ("- project configuration\n", "dim"), | |
| + | (" recipients/ ", "white"), | |
| + | ("- recipient public keys\n", "dim"), | |
| + | (" fingerprints/ ", "white"), | |
| + | ("- content fingerprints", "dim"), | |
| + | ), | |
| + | title="[green]Init Complete[/]", | |
| + | border_style="green", | |
| + | padding=(0, 2), | |
| + | )) | |
| + | ||
| + | if not config["issuer_identity"]: | |
| + | warn("No issuer identity set. Run: oversight keys generate") | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Command: keys generate | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def cmd_keys_generate(args): | |
| + | """Generate a new identity keypair.""" | |
| + | cfg = load_config() | |
| + | config_dir = config_dir_from_cfg(cfg) | |
| + | ||
| + | # Determine output paths | |
| + | identity_name = args.name or "identity" | |
| + | ||
| + | if args.out: | |
| + | out_path = Path(args.out) | |
| + | elif config_dir: | |
| + | out_path = config_dir / f"{identity_name}.json" | |
| + | else: | |
| + | out_path = Path(f"{identity_name}.json") | |
| + | ||
| + | if out_path.exists() and not args.force: | |
| + | error_panel( | |
| + | f"Identity file already exists: {out_path}", | |
| + | "Use --force to overwrite." | |
| + | ) | |
| + | sys.exit(1) | |
| + | ||
| + | with Progress( | |
| + | SpinnerColumn(), | |
| + | TextColumn("[progress.description]{task.description}"), | |
| + | transient=True, | |
| + | ) as progress: | |
| + | task = progress.add_task("Generating X25519 + Ed25519 keypair...", total=None) | |
| + | ident = ClassicIdentity.generate() | |
| + | ||
| + | priv_data = { | |
| + | "id": identity_name, | |
| + | "x25519_priv": ident.x25519_priv.hex(), | |
| + | "x25519_pub": ident.x25519_pub.hex(), | |
| + | "ed25519_priv": ident.ed25519_priv.hex(), | |
| + | "ed25519_pub": ident.ed25519_pub.hex(), | |
| + | } | |
| + | ||
| + | pub_path = out_path.with_suffix(".pub.json") | |
| + | pub_data = { | |
| + | "id": identity_name, | |
| + | "x25519_pub": ident.x25519_pub.hex(), | |
| + | "ed25519_pub": ident.ed25519_pub.hex(), | |
| + | } | |
| + | ||
| + | out_path.write_text(json.dumps(priv_data, indent=2)) | |
| + | pub_path.write_text(json.dumps(pub_data, indent=2)) | |
| + | ||
| + | # Update config if we have one | |
| + | if config_dir and not args.out: | |
| + | cfg["issuer_identity"] = str(out_path) | |
| + | save_config(config_dir, cfg) | |
| + | ||
| + | table = Table(title="Generated Identity", box=box.ROUNDED, border_style="green") | |
| + | table.add_column("Field", style="cyan") | |
| + | table.add_column("Value", style="white") | |
| + | table.add_row("Name", identity_name) | |
| + | table.add_row("Private key", str(out_path)) | |
| + | table.add_row("Public key", str(pub_path)) | |
| + | table.add_row("X25519 pub", format_hex_short(ident.x25519_pub.hex(), 32)) | |
| + | table.add_row("Ed25519 pub", format_hex_short(ident.ed25519_pub.hex(), 32)) | |
| + | table.add_row("Suite", "OSGT-CLASSIC-v1 (X25519 + Ed25519)") | |
| + | console.print(table) | |
| + | success("Identity generated. Share the .pub.json file with senders.") | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Command: keys list | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def cmd_keys_list(args): | |
| + | """List all known identities and recipients.""" | |
| + | cfg = load_config() | |
| + | config_dir = config_dir_from_cfg(cfg) | |
| + | ||
| + | if not config_dir: | |
| + | error_panel( | |
| + | "No .oversight/ directory found.", | |
| + | "Run: oversight init" | |
| + | ) | |
| + | sys.exit(1) | |
| + | ||
| + | # List identities (*.json but not *.pub.json in config root) | |
| + | identity_files = [ | |
| + | f for f in config_dir.glob("*.json") | |
| + | if not f.name.endswith(".pub.json") and f.name != CONFIG_FILENAME | |
| + | ] | |
| + | ||
| + | table = Table(title="Identities", box=box.ROUNDED, border_style="cyan") | |
| + | table.add_column("Name", style="bold") | |
| + | table.add_column("Ed25519 Public Key", style="white") | |
| + | table.add_column("X25519 Public Key", style="white") | |
| + | table.add_column("File", style="dim") | |
| + | ||
| + | for f in identity_files: | |
| + | try: | |
| + | data = json.loads(f.read_text()) | |
| + | is_active = str(f) == cfg.get("issuer_identity", "") | |
| + | name = data.get("id", f.stem) | |
| + | if is_active: | |
| + | name = f"[green]{name} (active)[/green]" | |
| + | table.add_row( | |
| + | name, | |
| + | format_hex_short(data.get("ed25519_pub", ""), 24), | |
| + | format_hex_short(data.get("x25519_pub", ""), 24), | |
| + | f.name, | |
| + | ) | |
| + | except (json.JSONDecodeError, OSError): | |
| + | table.add_row(f.stem, "[red]error reading[/]", "", f.name) | |
| + | ||
| + | console.print(table) | |
| + | ||
| + | # List recipients | |
| + | recipients_dir = config_dir / "recipients" | |
| + | recipient_files = list(recipients_dir.glob("*.json")) if recipients_dir.exists() else [] | |
| + | ||
| + | if recipient_files: | |
| + | rtable = Table(title="Recipients", box=box.ROUNDED, border_style="yellow") | |
| + | rtable.add_column("ID", style="bold") | |
| + | rtable.add_column("Ed25519 Public Key", style="white") | |
| + | rtable.add_column("X25519 Public Key", style="white") | |
| + | rtable.add_column("File", style="dim") | |
| + | ||
| + | for f in recipient_files: | |
| + | try: | |
| + | data = json.loads(f.read_text()) | |
| + | rtable.add_row( | |
| + | data.get("id", f.stem), | |
| + | format_hex_short(data.get("ed25519_pub", ""), 24), | |
| + | format_hex_short(data.get("x25519_pub", ""), 24), | |
| + | f.name, | |
| + | ) | |
| + | except (json.JSONDecodeError, OSError): | |
| + | rtable.add_row(f.stem, "[red]error reading[/]", "", f.name) | |
| + | ||
| + | console.print(rtable) | |
| + | else: | |
| + | info("No recipients imported. Use: oversight keys import <file>") | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Command: keys import | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def cmd_keys_import(args): | |
| + | """Import a recipient's public key.""" | |
| + | cfg = load_config() | |
| + | config_dir = config_dir_from_cfg(cfg) | |
| + | ||
| + | if not config_dir: | |
| + | error_panel( | |
| + | "No .oversight/ directory found.", | |
| + | "Run: oversight init" | |
| + | ) | |
| + | sys.exit(1) | |
| + | ||
| + | source = Path(args.file) | |
| + | if not source.exists(): | |
| + | error_panel(f"File not found: {source}") | |
| + | sys.exit(1) | |
| + | ||
| + | try: | |
| + | data = json.loads(source.read_text()) | |
| + | except (json.JSONDecodeError, OSError) as e: | |
| + | error_panel(f"Failed to parse key file: {e}") | |
| + | sys.exit(1) | |
| + | ||
| + | # Validate it has the needed fields | |
| + | if "x25519_pub" not in data: | |
| + | error_panel( | |
| + | "Key file missing x25519_pub field.", | |
| + | "Ensure this is a valid Oversight public key (.pub.json)." | |
| + | ) | |
| + | sys.exit(1) | |
| + | ||
| + | recipients_dir = config_dir / "recipients" | |
| + | recipients_dir.mkdir(exist_ok=True) | |
| + | ||
| + | name = data.get("id", source.stem) | |
| + | dest = recipients_dir / f"{name}.pub.json" | |
| + | ||
| + | if dest.exists() and not args.force: | |
| + | error_panel( | |
| + | f"Recipient already exists: {dest}", | |
| + | "Use --force to overwrite." | |
| + | ) | |
| + | sys.exit(1) | |
| + | ||
| + | dest.write_text(json.dumps(data, indent=2)) | |
| + | success(f"Imported recipient '{name}' to {dest}") | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Command: seal | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def cmd_seal(args): | |
| + | """Seal a file for a recipient with full rich output.""" | |
| + | cfg = load_config() | |
| + | config_dir = config_dir_from_cfg(cfg) | |
| + | ||
| + | input_path = Path(args.input) | |
| + | if not input_path.exists(): | |
| + | error_panel(f"Input file not found: {input_path}") | |
| + | sys.exit(1) | |
| + | ||
| + | # Resolve issuer identity | |
| + | issuer_key_path = args.issuer_key | |
| + | if not issuer_key_path and cfg.get("issuer_identity"): | |
| + | issuer_key_path = cfg["issuer_identity"] | |
| + | if not issuer_key_path: | |
| + | error_panel( | |
| + | "No issuer identity specified.", | |
| + | "Use --issuer-key or set issuer_identity in config. Run: oversight keys generate" | |
| + | ) | |
| + | sys.exit(1) | |
| + | ||
| + | issuer_key_path = Path(issuer_key_path) | |
| + | if not issuer_key_path.exists(): | |
| + | error_panel(f"Issuer key file not found: {issuer_key_path}") | |
| + | sys.exit(1) | |
| + | ||
| + | # Resolve recipient public key | |
| + | recipient_pub_path = args.to | |
| + | if not recipient_pub_path and config_dir: | |
| + | # Check recipients dir for a single recipient | |
| + | rdir = config_dir / "recipients" | |
| + | if rdir.exists(): | |
| + | rfiles = list(rdir.glob("*.json")) | |
| + | if len(rfiles) == 1: | |
| + | recipient_pub_path = str(rfiles[0]) | |
| + | ||
| + | if not recipient_pub_path: | |
| + | error_panel( | |
| + | "No recipient specified.", | |
| + | "Use --to <recipient.pub.json> or place a single key in .oversight/recipients/" | |
| + | ) | |
| + | sys.exit(1) | |
| + | ||
| + | recipient_pub_path = Path(recipient_pub_path) | |
| + | if not recipient_pub_path.exists(): | |
| + | # Try looking in recipients dir | |
| + | if config_dir: | |
| + | candidate = config_dir / "recipients" / f"{recipient_pub_path}.pub.json" | |
| + | if candidate.exists(): | |
| + | recipient_pub_path = candidate | |
| + | else: | |
| + | candidate = config_dir / "recipients" / str(recipient_pub_path) | |
| + | if candidate.exists(): | |
| + | recipient_pub_path = candidate | |
| + | if not recipient_pub_path.exists(): | |
| + | error_panel(f"Recipient key file not found: {recipient_pub_path}") | |
| + | sys.exit(1) | |
| + | ||
| + | # Load keys | |
| + | issuer = json.loads(issuer_key_path.read_text()) | |
| + | rec_pub = json.loads(recipient_pub_path.read_text()) | |
| + | plaintext = input_path.read_bytes() | |
| + | ||
| + | # Determine output path | |
| + | out_path = Path(args.out) if args.out else input_path.with_suffix(".sealed") | |
| + | ||
| + | # Resolve settings | |
| + | registry_url = args.registry_url or cfg.get("registry_url", "http://localhost:8000") | |
| + | registry_domain = args.registry_domain or cfg.get("registry_domain", "oversight.example") | |
| + | issuer_id = args.issuer_id or issuer.get("id", "issuer") | |
| + | do_watermark = args.watermark if args.watermark is not None else cfg.get("default_watermark", True) | |
| + | content_type_val = args.content_type or cfg.get("content_type", "application/octet-stream") | |
| + | ||
| + | watermarks_for_manifest: list[WatermarkRef] = [] | |
| + | fingerprint = None | |
| + | mark_id = None | |
| + | ||
| + | # Run the seal pipeline with progress | |
| + | with Progress( | |
| + | SpinnerColumn(), | |
| + | TextColumn("[progress.description]{task.description}"), | |
| + | BarColumn(bar_width=30), | |
| + | TaskProgressColumn(), | |
| + | console=console, | |
| + | ) as progress: | |
| + | total_steps = 7 if do_watermark else 4 | |
| + | task = progress.add_task("Sealing...", total=total_steps) | |
| + | ||
| + | # Step 1-3: Watermarking (if enabled) | |
| + | if do_watermark: | |
| + | try: | |
| + | text = plaintext.decode("utf-8") | |
| + | except UnicodeDecodeError: | |
| + | warn("Input is not UTF-8 text; skipping watermarks.") | |
| + | text = None | |
| + | progress.advance(task, 3) | |
| + | ||
| + | if text is not None: | |
| + | mark_id = watermark.new_mark_id() | |
| + | ||
| + | progress.update(task, description="Watermarking L3 (semantic)...") | |
| + | text = semantic.apply_semantic(text, mark_id) | |
| + | progress.advance(task) | |
| + | ||
| + | progress.update(task, description="Watermarking L2 (whitespace)...") | |
| + | text = watermark.embed_ws(text, mark_id) | |
| + | progress.advance(task) | |
| + | ||
| + | progress.update(task, description="Watermarking L1 (zero-width)...") | |
| + | text = watermark.embed_zw(text, mark_id) | |
| + | plaintext = text.encode("utf-8") | |
| + | progress.advance(task) | |
| + | ||
| + | watermarks_for_manifest = [ | |
| + | WatermarkRef(layer="L1_zero_width", mark_id=mark_id.hex()), | |
| + | WatermarkRef(layer="L2_whitespace", mark_id=mark_id.hex()), | |
| + | WatermarkRef(layer="L3_semantic", mark_id=mark_id.hex()), | |
| + | ] | |
| + | ||
| + | # Step 4: Build manifest | |
| + | progress.update(task, description="Building manifest...") | |
| + | recipient_obj = Recipient( | |
| + | recipient_id=rec_pub["id"], | |
| + | x25519_pub=rec_pub["x25519_pub"], | |
| + | ed25519_pub=rec_pub.get("ed25519_pub"), | |
| + | ) | |
| + | ||
| + | beacons = beacon.gen_beacons( | |
| + | registry_domain=registry_domain, | |
| + | file_id="pending", | |
| + | recipient_id=rec_pub["id"], | |
| + | ) | |
| + | ||
| + | manifest = Manifest.new( | |
| + | original_filename=input_path.name, | |
| + | content_hash=content_hash(plaintext), | |
| + | size_bytes=len(plaintext), | |
| + | issuer_id=issuer_id, | |
| + | issuer_ed25519_pub_hex=issuer["ed25519_pub"], | |
| + | recipient=recipient_obj, | |
| + | registry_url=registry_url, | |
| + | content_type=content_type_val, | |
| + | ) | |
| + | manifest.watermarks = watermarks_for_manifest | |
| + | manifest.beacons = [b.to_dict() for b in beacons] | |
| + | progress.advance(task) | |
| + | ||
| + | # Step 5: Compute fingerprint | |
| + | progress.update(task, description="Computing content fingerprint...") | |
| + | try: | |
| + | fp_text = plaintext.decode("utf-8") | |
| + | fingerprint = ContentFingerprint.from_text(fp_text) | |
| + | except UnicodeDecodeError: | |
| + | pass | |
| + | progress.advance(task) | |
| + | ||
| + | # Step 6: Encrypt and seal | |
| + | progress.update(task, description="Encrypting (XChaCha20-Poly1305)...") | |
| + | blob = seal( | |
| + | plaintext=plaintext, | |
| + | manifest=manifest, | |
| + | issuer_ed25519_priv=bytes.fromhex(issuer["ed25519_priv"]), | |
| + | recipient_x25519_pub=bytes.fromhex(rec_pub["x25519_pub"]), | |
| + | ) | |
| + | progress.advance(task) | |
| + | ||
| + | # Step 7: Write output | |
| + | progress.update(task, description="Writing sealed file...") | |
| + | out_path.write_bytes(blob) | |
| + | ||
| + | if fingerprint: | |
| + | fp_path = out_path.with_suffix(".fingerprint.json") | |
| + | fp_path.write_text(json.dumps({ | |
| + | "file_id": manifest.file_id, | |
| + | "recipient_id": rec_pub["id"], | |
| + | "mark_id": mark_id.hex() if mark_id else None, | |
| + | "fingerprint": fingerprint.to_dict(), | |
| + | }, indent=2)) | |
| + | ||
| + | progress.advance(task) | |
| + | ||
| + | # Summary panel | |
| + | table = Table(box=box.SIMPLE, show_header=False, padding=(0, 2)) | |
| + | table.add_column("Key", style="cyan") | |
| + | table.add_column("Value", style="white") | |
| + | table.add_row("Output", str(out_path)) | |
| + | table.add_row("Size", f"{len(blob):,} bytes") | |
| + | table.add_row("File ID", manifest.file_id) | |
| + | table.add_row("Issuer", issuer_id) | |
| + | table.add_row("Recipient", rec_pub["id"]) | |
| + | table.add_row("Watermarks", str(len(watermarks_for_manifest))) | |
| + | table.add_row("Beacons", str(len(beacons))) | |
| + | table.add_row("Suite", "OSGT-CLASSIC-v1") | |
| + | if mark_id: | |
| + | table.add_row("Mark ID", mark_id.hex()) | |
| + | if fingerprint: | |
| + | table.add_row("Fingerprint", f"{len(fingerprint.winnowing_fp)} winnow, {len(fingerprint.sentence_fp)} sentence hashes") | |
| + | ||
| + | console.print(Panel(table, title="[green]Sealed[/]", border_style="green", padding=(0, 1))) | |
| + | ||
| + | # Register if requested | |
| + | register_url = args.register or cfg.get("auto_register") | |
| + | if register_url: | |
| + | _do_register(register_url, manifest, beacons, watermarks_for_manifest, fingerprint) | |
| + | ||
| + | ||
| + | def _do_register(register_url: str, manifest, beacons, watermarks_for_manifest, fingerprint): | |
| + | """Register with the registry server.""" | |
| + | reg_payload = { | |
| + | "manifest": manifest.to_dict(), | |
| + | "beacons": [b.to_dict() for b in beacons], | |
| + | "watermarks": [w.__dict__ for w in watermarks_for_manifest], | |
| + | } | |
| + | if fingerprint: | |
| + | reg_payload["fingerprint"] = fingerprint.to_dict() | |
| + | try: | |
| + | resp = httpx.post( | |
| + | f"{register_url.rstrip('/')}/register", | |
| + | json=reg_payload, | |
| + | timeout=10, | |
| + | ) | |
| + | resp.raise_for_status() | |
| + | data = resp.json() | |
| + | success(f"Registered with {register_url}: tlog_index={data.get('tlog_index')}") | |
| + | except Exception as e: | |
| + | warn(f"Registry registration failed: {e}") | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Command: open | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def cmd_open(args): | |
| + | """Open (decrypt) a sealed file.""" | |
| + | cfg = load_config() | |
| + | ||
| + | input_path = Path(args.input) | |
| + | if not input_path.exists(): | |
| + | error_panel(f"Sealed file not found: {input_path}") | |
| + | sys.exit(1) | |
| + | ||
| + | # Resolve identity | |
| + | identity_path = args.identity | |
| + | if not identity_path and cfg.get("issuer_identity"): | |
| + | identity_path = cfg["issuer_identity"] | |
| + | if not identity_path: | |
| + | error_panel( | |
| + | "No identity specified.", | |
| + | "Use --identity <file> or set issuer_identity in config." | |
| + | ) | |
| + | sys.exit(1) | |
| + | ||
| + | identity_path = Path(identity_path) | |
| + | if not identity_path.exists(): | |
| + | error_panel(f"Identity file not found: {identity_path}") | |
| + | sys.exit(1) | |
| + | ||
| + | out_path = Path(args.out) if args.out else input_path.with_suffix("") | |
| + | ||
| + | ident = json.loads(identity_path.read_text()) | |
| + | ||
| + | with Progress( | |
| + | SpinnerColumn(), | |
| + | TextColumn("[progress.description]{task.description}"), | |
| + | transient=True, | |
| + | ) as progress: | |
| + | task = progress.add_task("Decrypting...", total=None) | |
| + | ||
| + | try: | |
| + | blob = input_path.read_bytes() | |
| + | plaintext, manifest = open_sealed( | |
| + | blob, | |
| + | recipient_x25519_priv=bytes.fromhex(ident["x25519_priv"]), | |
| + | ) | |
| + | out_path.write_bytes(plaintext) | |
| + | except ValueError as e: | |
| + | error_panel( | |
| + | f"Decryption failed: {e}", | |
| + | "Verify you are using the correct recipient identity." | |
| + | ) | |
| + | sys.exit(1) | |
| + | ||
| + | table = Table(box=box.SIMPLE, show_header=False, padding=(0, 2)) | |
| + | table.add_column("Key", style="cyan") | |
| + | table.add_column("Value", style="white") | |
| + | table.add_row("Output", str(out_path)) | |
| + | table.add_row("Size", f"{len(plaintext):,} bytes") | |
| + | table.add_row("File ID", manifest.file_id) | |
| + | table.add_row("Issuer", manifest.issuer_id) | |
| + | table.add_row("Recipient", manifest.recipient.recipient_id if manifest.recipient else "unknown") | |
| + | table.add_row("Watermarks", str(len(manifest.watermarks))) | |
| + | table.add_row("Beacons", str(len(manifest.beacons))) | |
| + | ||
| + | console.print(Panel(table, title="[green]Decrypted[/]", border_style="green", padding=(0, 1))) | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Command: inspect | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def cmd_inspect(args): | |
| + | """Display the manifest from a sealed file without decrypting.""" | |
| + | input_path = Path(args.input) | |
| + | if not input_path.exists(): | |
| + | error_panel(f"Sealed file not found: {input_path}") | |
| + | sys.exit(1) | |
| + | ||
| + | try: | |
| + | blob = input_path.read_bytes() | |
| + | sf = SealedFile.from_bytes(blob) | |
| + | except ValueError as e: | |
| + | error_panel(f"Failed to parse sealed file: {e}") | |
| + | sys.exit(1) | |
| + | ||
| + | m = sf.manifest | |
| + | sig_valid = m.verify() | |
| + | ||
| + | # Header info | |
| + | header = Table(box=box.SIMPLE, show_header=False, padding=(0, 2)) | |
| + | header.add_column("Key", style="cyan") | |
| + | header.add_column("Value", style="white") | |
| + | header.add_row("File", str(input_path)) | |
| + | header.add_row("Version", m.version) | |
| + | header.add_row("Suite", m.suite) | |
| + | header.add_row("File ID", m.file_id) | |
| + | header.add_row("Issued At", time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(m.issued_at))) | |
| + | ||
| + | sig_style = "bold green" if sig_valid else "bold red" | |
| + | sig_text = "VALID" if sig_valid else "INVALID" | |
| + | header.add_row("Signature", f"[{sig_style}]{sig_text}[/]") | |
| + | ||
| + | console.print(Panel(header, title="[cyan]Manifest[/]", border_style="cyan", padding=(0, 1))) | |
| + | ||
| + | # Content info | |
| + | content = Table(box=box.SIMPLE, show_header=False, padding=(0, 2)) | |
| + | content.add_column("Key", style="cyan") | |
| + | content.add_column("Value", style="white") | |
| + | content.add_row("Filename", m.original_filename) | |
| + | content.add_row("Content Type", m.content_type) | |
| + | content.add_row("Size", f"{m.size_bytes:,} bytes") | |
| + | content.add_row("Content Hash", format_hex_short(m.content_hash, 32)) | |
| + | ||
| + | console.print(Panel(content, title="[cyan]Content[/]", border_style="cyan", padding=(0, 1))) | |
| + | ||
| + | # Identity info | |
| + | ident_table = Table(box=box.SIMPLE, show_header=False, padding=(0, 2)) | |
| + | ident_table.add_column("Key", style="cyan") | |
| + | ident_table.add_column("Value", style="white") | |
| + | ident_table.add_row("Issuer ID", m.issuer_id) | |
| + | ident_table.add_row("Issuer Ed25519", format_hex_short(m.issuer_ed25519_pub, 32)) | |
| + | if m.recipient: | |
| + | ident_table.add_row("Recipient ID", m.recipient.recipient_id) | |
| + | ident_table.add_row("Recipient X25519", format_hex_short(m.recipient.x25519_pub, 32)) | |
| + | ||
| + | console.print(Panel(ident_table, title="[cyan]Identities[/]", border_style="cyan", padding=(0, 1))) | |
| + | ||
| + | # Watermarks | |
| + | if m.watermarks: | |
| + | wm_table = Table(box=box.ROUNDED, border_style="yellow") | |
| + | wm_table.add_column("Layer", style="bold") | |
| + | wm_table.add_column("Mark ID", style="white") | |
| + | for w in m.watermarks: | |
| + | wm_table.add_row(w.layer, w.mark_id) | |
| + | console.print(wm_table) | |
| + | ||
| + | # Beacons | |
| + | if m.beacons: | |
| + | b_table = Table(box=box.ROUNDED, border_style="magenta") | |
| + | b_table.add_column("Kind", style="bold") | |
| + | b_table.add_column("Token ID", style="white") | |
| + | b_table.add_column("URL", style="dim") | |
| + | for b in m.beacons: | |
| + | b_table.add_row( | |
| + | b.get("kind", ""), | |
| + | format_hex_short(b.get("token_id", ""), 20), | |
| + | b.get("url", ""), | |
| + | ) | |
| + | console.print(b_table) | |
| + | ||
| + | # Policy | |
| + | if m.policy: | |
| + | p_table = Table(box=box.SIMPLE, show_header=False, padding=(0, 2)) | |
| + | p_table.add_column("Key", style="cyan") | |
| + | p_table.add_column("Value", style="white") | |
| + | for k, v in m.policy.items(): | |
| + | p_table.add_row(k, str(v)) | |
| + | console.print(Panel(p_table, title="[cyan]Policy[/]", border_style="cyan", padding=(0, 1))) | |
| + | ||
| + | # Raw JSON option | |
| + | if args.json: | |
| + | console.print(Rule("Raw Manifest JSON")) | |
| + | console.print_json(json.dumps(m.to_dict(), default=str)) | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Command: attribute | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def cmd_attribute(args): | |
| + | """Run full 5-phase attribution on a leaked file.""" | |
| + | cfg = load_config() | |
| + | ||
| + | leak_path = Path(args.leak) | |
| + | if not leak_path.exists(): | |
| + | error_panel(f"Leak file not found: {leak_path}") | |
| + | sys.exit(1) | |
| + | ||
| + | text = leak_path.read_text(encoding="utf-8", errors="replace") | |
| + | registry_url = args.registry or cfg.get("registry_url", "http://localhost:8000") | |
| + | ||
| + | # Resolve fingerprints path | |
| + | fingerprints_path = args.fingerprints | |
| + | if not fingerprints_path and cfg.get("_config_dir"): | |
| + | fp_dir = Path(cfg["_config_dir"]) / "fingerprints" | |
| + | if fp_dir.exists() and any(fp_dir.glob("*.fingerprint.json")): | |
| + | fingerprints_path = str(fp_dir) | |
| + | ||
| + | console.print(Rule("[bold]Attribution Analysis[/]", style="red")) | |
| + | console.print() | |
| + | ||
| + | # ===== Phase 1: Direct extraction ===== | |
| + | with Progress( | |
| + | SpinnerColumn(), | |
| + | TextColumn("[progress.description]{task.description}"), | |
| + | transient=True, | |
| + | ) as progress: | |
| + | progress.add_task("Phase 1: Extracting L1 + L2 marks...", total=None) | |
| + | l1_marks = watermark.extract_zw(text) | |
| + | l2_candidate, l2_conf, l2_bits, l2_needed = watermark.extract_ws_partial(text) | |
| + | ||
| + | l1_unique = list(set(l1_marks)) | |
| + | direct_candidates: list[bytes] = list(l1_unique) | |
| + | if l2_candidate and l2_conf >= 0.5: | |
| + | if l2_candidate not in direct_candidates: | |
| + | direct_candidates.append(l2_candidate) | |
| + | ||
| + | p1_table = Table(box=box.SIMPLE, show_header=False, padding=(0, 2)) | |
| + | p1_table.add_column("Layer", style="cyan", width=8) | |
| + | p1_table.add_column("Result", style="white") | |
| + | ||
| + | if l1_unique: | |
| + | for m_id in l1_unique: | |
| + | p1_table.add_row("L1", f"[green]{m_id.hex()}[/] ({len(l1_marks)} frames, {len(l1_unique)} unique)") | |
| + | else: | |
| + | p1_table.add_row("L1", "[red]No zero-width frames found (stripped?)[/]") | |
| + | ||
| + | if l2_conf >= 1.0: | |
| + | p1_table.add_row("L2", f"[green]{l2_candidate.hex()}[/] ({l2_bits}/{l2_needed} bits, 100%)") | |
| + | elif l2_conf > 0: | |
| + | p1_table.add_row("L2", f"[yellow]{l2_candidate.hex()}[/] ({l2_bits}/{l2_needed} bits, {l2_conf:.0%} partial)") | |
| + | else: | |
| + | p1_table.add_row("L2", "[red]No trailing whitespace marks found (stripped?)[/]") | |
| + | ||
| + | console.print(Panel(p1_table, title="[bold]Phase 1: Direct Extraction[/]", border_style="cyan", padding=(0, 1))) | |
| + | ||
| + | # ===== Phase 2: Registry query ===== | |
| + | registry_candidates: list[bytes] = [] | |
| + | p2_results = [] | |
| + | ||
| + | with Progress( | |
| + | SpinnerColumn(), | |
| + | TextColumn("[progress.description]{task.description}"), | |
| + | transient=True, | |
| + | ) as progress: | |
| + | progress.add_task(f"Phase 2: Querying registry at {registry_url}...", total=None) | |
| + | ||
| + | if direct_candidates: | |
| + | for m_id in direct_candidates: | |
| + | try: | |
| + | resp = httpx.post( | |
| + | f"{registry_url.rstrip('/')}/attribute", | |
| + | json={"mark_id": m_id.hex(), "layer": "L1_zero_width"}, | |
| + | timeout=10, | |
| + | ) | |
| + | data = resp.json() | |
| + | if data.get("found"): | |
| + | p2_results.append((m_id.hex(), data.get("recipient_id"), data.get("file_id"))) | |
| + | except Exception as e: | |
| + | p2_results.append((m_id.hex(), f"query failed: {e}", None)) | |
| + | ||
| + | try: | |
| + | resp = httpx.get(f"{registry_url.rstrip('/')}/marks", timeout=10) | |
| + | if resp.status_code == 200: | |
| + | registry_data = resp.json() | |
| + | for entry in registry_data.get("marks", []): | |
| + | mid_bytes = bytes.fromhex(entry["mark_id"]) | |
| + | if mid_bytes not in registry_candidates: | |
| + | registry_candidates.append(mid_bytes) | |
| + | except Exception: | |
| + | pass | |
| + | ||
| + | p2_table = Table(box=box.SIMPLE, show_header=False, padding=(0, 2)) | |
| + | p2_table.add_column("Field", style="cyan") | |
| + | p2_table.add_column("Value", style="white") | |
| + | ||
| + | if p2_results: | |
| + | for mark_hex, recipient, file_id in p2_results: | |
| + | if file_id: | |
| + | p2_table.add_row("Match", f"[green]{mark_hex}[/] -> recipient={recipient}, file={file_id}") | |
| + | else: | |
| + | p2_table.add_row("Query", f"{mark_hex}: {recipient}") | |
| + | else: | |
| + | p2_table.add_row("Result", "No direct candidates to query" if not direct_candidates else "No matches found") | |
| + | ||
| + | if registry_candidates: | |
| + | p2_table.add_row("Registry", f"Fetched {len(registry_candidates)} candidate mark_id(s)") | |
| + | ||
| + | console.print(Panel(p2_table, title="[bold]Phase 2: Registry Query[/]", border_style="cyan", padding=(0, 1))) | |
| + | ||
| + | # ===== Phase 3: L3 semantic verification ===== | |
| + | all_candidates = direct_candidates + [ | |
| + | m_id for m_id in registry_candidates if m_id not in direct_candidates | |
| + | ] | |
| + | ||
| + | p3_hits = [] | |
| + | with Progress( | |
| + | SpinnerColumn(), | |
| + | TextColumn("[progress.description]{task.description}"), | |
| + | transient=True, | |
| + | ) as progress: | |
| + | progress.add_task(f"Phase 3: L3 semantic verification ({len(all_candidates)} candidates)...", total=None) | |
| + | if all_candidates: | |
| + | p3_hits = watermark.verify_l3(text, all_candidates) | |
| + | ||
| + | p3_table = Table(box=box.SIMPLE, show_header=False, padding=(0, 2)) | |
| + | p3_table.add_column("Field", style="cyan") | |
| + | p3_table.add_column("Value", style="white") | |
| + | ||
| + | if p3_hits: | |
| + | for mid, score, detail in p3_hits: | |
| + | p3_table.add_row( | |
| + | "L3 Match", | |
| + | f"[green]{mid.hex()}[/] score={score:.2f} " | |
| + | f"(synonyms={detail['synonyms_score']:.2f}, " | |
| + | f"punct={detail['punctuation_hits']}, dict={detail['dict_version']})" | |
| + | ) | |
| + | elif not all_candidates: | |
| + | p3_table.add_row("Result", "[yellow]No candidates available (L1/L2 stripped, registry unreachable?)[/]") | |
| + | else: | |
| + | p3_table.add_row("Result", f"[yellow]No candidates matched above threshold ({len(all_candidates)} tested)[/]") | |
| + | ||
| + | console.print(Panel(p3_table, title="[bold]Phase 3: Semantic Verification[/]", border_style="cyan", padding=(0, 1))) | |
| + | ||
| + | # ===== Phase 4: Multi-layer fusion ===== | |
| + | with Progress( | |
| + | SpinnerColumn(), | |
| + | TextColumn("[progress.description]{task.description}"), | |
| + | transient=True, | |
| + | ) as progress: | |
| + | progress.add_task("Phase 4: Multi-layer Bayesian fusion...", total=None) | |
| + | result = watermark.recover_marks_v2(text, all_candidates if all_candidates else None) | |
| + | ||
| + | if result["candidates"]: | |
| + | fusion_table = Table(box=box.ROUNDED, border_style="green") | |
| + | fusion_table.add_column("Mark ID", style="bold white") | |
| + | fusion_table.add_column("Score", style="bold") | |
| + | fusion_table.add_column("Layers", style="cyan") | |
| + | ||
| + | for mark_id_val, score, layers in result["candidates"]: | |
| + | score_style = "green" if score >= 0.8 else "yellow" if score >= 0.5 else "red" | |
| + | fusion_table.add_row( | |
| + | mark_id_val.hex(), | |
| + | f"[{score_style}]{score:.1%}[/]", | |
| + | layers, | |
| + | ) | |
| + | ||
| + | console.print(Panel(fusion_table, title="[bold]Phase 4: Fusion Results[/]", border_style="green", padding=(0, 1))) | |
| + | ||
| + | best = result["candidates"][0] | |
| + | attribution_body = Text() | |
| + | attribution_body.append(f"Mark ID: {best[0].hex()}\n", style="bold white") | |
| + | attribution_body.append(f"Confidence: {best[1]:.1%}\n", style="bold green" if best[1] >= 0.8 else "bold yellow") | |
| + | attribution_body.append(f"Evidence: {best[2]}\n", style="cyan") | |
| + | ||
| + | # Final registry lookup | |
| + | try: | |
| + | resp = httpx.post( | |
| + | f"{registry_url.rstrip('/')}/attribute", | |
| + | json={"mark_id": best[0].hex(), "layer": "fused"}, | |
| + | timeout=10, | |
| + | ) | |
| + | data = resp.json() | |
| + | if data.get("found"): | |
| + | attribution_body.append(f"File ID: {data['file_id']}\n", style="white") | |
| + | attribution_body.append(f"Recipient: {data['recipient_id']}\n", style="bold white") | |
| + | attribution_body.append(f"Issuer: {data['issuer_id']}\n", style="white") | |
| + | except Exception: | |
| + | pass | |
| + | ||
| + | console.print(Panel( | |
| + | attribution_body, | |
| + | title="[bold red]ATTRIBUTION[/]", | |
| + | border_style="red", | |
| + | padding=(0, 2), | |
| + | )) | |
| + | else: | |
| + | console.print(Panel( | |
| + | "[red]No marks recovered from any layer.[/]", | |
| + | title="[bold]Phase 4: Fusion[/]", | |
| + | border_style="red", | |
| + | padding=(0, 2), | |
| + | )) | |
| + | if result["diagnostics"]: | |
| + | for d in result["diagnostics"]: | |
| + | info(d) | |
| + | ||
| + | # ===== Phase 5: Content fingerprint comparison ===== | |
| + | if fingerprints_path: | |
| + | with Progress( | |
| + | SpinnerColumn(), | |
| + | TextColumn("[progress.description]{task.description}"), | |
| + | transient=True, | |
| + | ) as progress: | |
| + | progress.add_task("Phase 5: Content fingerprint comparison...", total=None) | |
| + | leak_fp = ContentFingerprint.from_text(text) | |
| + | ||
| + | fp_dir = Path(fingerprints_path) | |
| + | if fp_dir.is_dir(): | |
| + | fp_files = list(fp_dir.glob("*.fingerprint.json")) | |
| + | elif fp_dir.is_file(): | |
| + | fp_files = [fp_dir] | |
| + | else: | |
| + | fp_files = [] | |
| + | ||
| + | if fp_files: | |
| + | fp_table = Table(box=box.ROUNDED, border_style="magenta") | |
| + | fp_table.add_column("File", style="dim") | |
| + | fp_table.add_column("Recipient", style="bold") | |
| + | fp_table.add_column("Winnow", style="white") | |
| + | fp_table.add_column("Sentence", style="white") | |
| + | fp_table.add_column("Combined", style="bold") | |
| + | fp_table.add_column("Verdict", style="bold") | |
| + | ||
| + | best_fp_match = None | |
| + | best_fp_score = 0.0 | |
| + | ||
| + | for fp_file in fp_files: | |
| + | try: | |
| + | fp_data = json.loads(fp_file.read_text()) | |
| + | stored_fp = ContentFingerprint.from_dict(fp_data["fingerprint"]) | |
| + | sim = leak_fp.similarity(stored_fp) | |
| + | recipient_id = fp_data.get("recipient_id", "unknown") | |
| + | fp_mark_id = fp_data.get("mark_id", "unknown") | |
| + | ||
| + | if sim["combined"] >= 0.1: | |
| + | verdict_style = ( | |
| + | "green" if sim["verdict"] == "MATCH" | |
| + | else "yellow" if sim["verdict"] == "LIKELY" | |
| + | else "red" | |
| + | ) | |
| + | fp_table.add_row( | |
| + | fp_file.name, | |
| + | recipient_id, | |
| + | f"{sim['winnowing']:.2f}", | |
| + | f"{sim['sentence']:.2f}", | |
| + | f"{sim['combined']:.2f}", | |
| + | f"[{verdict_style}]{sim['verdict']}[/]", | |
| + | ) | |
| + | ||
| + | if sim["combined"] > best_fp_score: | |
| + | best_fp_score = sim["combined"] | |
| + | best_fp_match = { | |
| + | "file": fp_file.name, | |
| + | "recipient_id": recipient_id, | |
| + | "mark_id": fp_mark_id, | |
| + | "similarity": sim, | |
| + | } | |
| + | except Exception as e: | |
| + | warn(f"Error reading {fp_file.name}: {e}") | |
| + | ||
| + | console.print(Panel(fp_table, title="[bold]Phase 5: Fingerprint Comparison[/]", border_style="magenta", padding=(0, 1))) | |
| + | ||
| + | if best_fp_match and best_fp_score >= 0.3: | |
| + | fp_body = Text() | |
| + | fp_body.append(f"Verdict: {best_fp_match['similarity']['verdict']}\n", style="bold") | |
| + | fp_body.append(f"Recipient: {best_fp_match['recipient_id']}\n", style="bold white") | |
| + | fp_body.append(f"Mark ID: {best_fp_match['mark_id']}\n", style="white") | |
| + | fp_body.append(f"Confidence: {best_fp_score:.1%}\n", style="green") | |
| + | fp_body.append(f"Winnowing: {best_fp_match['similarity']['winnowing']:.1%}\n", style="dim") | |
| + | fp_body.append(f"Sentence: {best_fp_match['similarity']['sentence']:.1%}", style="dim") | |
| + | ||
| + | console.print(Panel( | |
| + | fp_body, | |
| + | title="[bold magenta]FINGERPRINT ATTRIBUTION[/]", | |
| + | border_style="magenta", | |
| + | padding=(0, 2), | |
| + | )) | |
| + | else: | |
| + | info("No fingerprint files found to compare against.") | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Command: status | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def cmd_status(args): | |
| + | """Show config, identity, registry health, version info.""" | |
| + | cfg = load_config() | |
| + | config_dir = config_dir_from_cfg(cfg) | |
| + | ||
| + | # Version and config panel | |
| + | status_table = Table(box=box.SIMPLE, show_header=False, padding=(0, 2)) | |
| + | status_table.add_column("Key", style="cyan") | |
| + | status_table.add_column("Value", style="white") | |
| + | status_table.add_row("CLI version", CLI_VERSION) | |
| + | status_table.add_row("Core version", core_version) | |
| + | ||
| + | if config_dir: | |
| + | status_table.add_row("Config dir", str(config_dir)) | |
| + | else: | |
| + | status_table.add_row("Config dir", "[yellow]not found (run: oversight init)[/]") | |
| + | ||
| + | # Config values | |
| + | if cfg.get("issuer_identity"): | |
| + | ident_path = Path(cfg["issuer_identity"]) | |
| + | if ident_path.exists(): | |
| + | try: | |
| + | ident_data = json.loads(ident_path.read_text()) | |
| + | status_table.add_row("Issuer ID", ident_data.get("id", "unknown")) | |
| + | status_table.add_row("Ed25519 pub", format_hex_short(ident_data.get("ed25519_pub", ""), 32)) | |
| + | except (json.JSONDecodeError, OSError): | |
| + | status_table.add_row("Issuer identity", f"[red]error reading {ident_path}[/]") | |
| + | else: | |
| + | status_table.add_row("Issuer identity", f"[yellow]file not found: {ident_path}[/]") | |
| + | else: | |
| + | status_table.add_row("Issuer identity", "[yellow]not configured[/]") | |
| + | ||
| + | registry_url = cfg.get("registry_url", "not configured") | |
| + | status_table.add_row("Registry URL", registry_url) | |
| + | status_table.add_row("Default watermark", str(cfg.get("default_watermark", "not set"))) | |
| + | ||
| + | # Recipients count | |
| + | if config_dir: | |
| + | rdir = config_dir / "recipients" | |
| + | rcount = len(list(rdir.glob("*.json"))) if rdir.exists() else 0 | |
| + | status_table.add_row("Recipients", str(rcount)) | |
| + | ||
| + | fdir = config_dir / "fingerprints" | |
| + | fcount = len(list(fdir.glob("*.fingerprint.json"))) if fdir.exists() else 0 | |
| + | status_table.add_row("Fingerprints", str(fcount)) | |
| + | ||
| + | console.print(Panel(status_table, title="[cyan]Oversight Status[/]", border_style="cyan", padding=(0, 1))) | |
| + | ||
| + | # Registry health check | |
| + | if registry_url and registry_url != "not configured": | |
| + | with Progress( | |
| + | SpinnerColumn(), | |
| + | TextColumn("[progress.description]{task.description}"), | |
| + | transient=True, | |
| + | ) as progress: | |
| + | progress.add_task(f"Checking registry at {registry_url}...", total=None) | |
| + | try: | |
| + | resp = httpx.get(f"{registry_url.rstrip('/')}/health", timeout=5) | |
| + | if resp.status_code == 200: | |
| + | health_data = resp.json() | |
| + | htable = Table(box=box.SIMPLE, show_header=False, padding=(0, 2)) | |
| + | htable.add_column("Key", style="cyan") | |
| + | htable.add_column("Value", style="white") | |
| + | htable.add_row("Status", f"[green]{health_data.get('status', 'ok')}[/]") | |
| + | htable.add_row("Service", health_data.get("service", "unknown")) | |
| + | htable.add_row("Version", health_data.get("version", "unknown")) | |
| + | htable.add_row("TLog size", str(health_data.get("tlog_size", 0))) | |
| + | console.print(Panel(htable, title="[green]Registry Health[/]", border_style="green", padding=(0, 1))) | |
| + | else: | |
| + | warn(f"Registry returned HTTP {resp.status_code}") | |
| + | except httpx.ConnectError: | |
| + | warn(f"Registry unreachable at {registry_url}") | |
| + | except Exception as e: | |
| + | warn(f"Registry check failed: {e}") | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Command: registry start | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def cmd_registry_start(args): | |
| + | """Start the local registry server.""" | |
| + | import subprocess | |
| + | ||
| + | host = args.host or "0.0.0.0" | |
| + | port = args.port or 8000 | |
| + | ||
| + | registry_script = ROOT / "registry" / "server.py" | |
| + | if not registry_script.exists(): | |
| + | error_panel( | |
| + | f"Registry server not found at {registry_script}", | |
| + | "Verify the Oversight installation is complete." | |
| + | ) | |
| + | sys.exit(1) | |
| + | ||
| + | info(f"Starting registry server on {host}:{port}") | |
| + | info(f"Server module: {registry_script}") | |
| + | console.print(Rule("Registry Server", style="cyan")) | |
| + | ||
| + | try: | |
| + | subprocess.run( | |
| + | [ | |
| + | sys.executable, "-m", "uvicorn", | |
| + | "registry.server:app", | |
| + | "--host", host, | |
| + | "--port", str(port), | |
| + | ], | |
| + | cwd=str(ROOT), | |
| + | ) | |
| + | except KeyboardInterrupt: | |
| + | info("Registry server stopped.") | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Argparse setup | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def build_parser() -> argparse.ArgumentParser: | |
| + | p = argparse.ArgumentParser( | |
| + | prog="oversight", | |
| + | description="Oversight Protocol CLI -- data provenance, attribution, and leak detection.", | |
| + | ) | |
| + | p.add_argument("--no-banner", action="store_true", help="suppress startup banner") | |
| + | sub = p.add_subparsers(dest="cmd") | |
| + | ||
| + | # init | |
| + | init_p = sub.add_parser("init", help="initialize .oversight/ directory") | |
| + | init_p.add_argument("--path", default=None, help="target directory (default: cwd)") | |
| + | init_p.add_argument("--registry-url", default=None, help="registry server URL") | |
| + | init_p.add_argument("--registry-domain", default=None, help="registry domain for beacons") | |
| + | init_p.add_argument("--force", action="store_true", help="overwrite existing config") | |
| + | ||
| + | # keys | |
| + | keys_p = sub.add_parser("keys", help="key management") | |
| + | keys_sub = keys_p.add_subparsers(dest="keys_cmd") | |
| + | ||
| + | kg = keys_sub.add_parser("generate", help="generate a new identity keypair") | |
| + | kg.add_argument("--name", default=None, help="identity name (default: identity)") | |
| + | kg.add_argument("--out", default=None, help="output file path") | |
| + | kg.add_argument("--force", action="store_true", help="overwrite existing identity") | |
| + | ||
| + | kl = keys_sub.add_parser("list", help="list identities and recipients") | |
| + | ||
| + | ki = keys_sub.add_parser("import", help="import a recipient public key") | |
| + | ki.add_argument("file", help="path to recipient .pub.json file") | |
| + | ki.add_argument("--force", action="store_true", help="overwrite existing recipient") | |
| + | ||
| + | # seal | |
| + | seal_p = sub.add_parser("seal", help="seal a file for a recipient") | |
| + | seal_p.add_argument("input", help="input file to seal") | |
| + | seal_p.add_argument("--to", default=None, help="recipient public key file or name") | |
| + | seal_p.add_argument("--issuer-key", default=None, help="issuer private key file") | |
| + | seal_p.add_argument("--issuer-id", default=None, help="issuer identifier") | |
| + | seal_p.add_argument("--registry-url", default=None, help="registry URL") | |
| + | seal_p.add_argument("--registry-domain", default=None, help="registry domain for beacons") | |
| + | seal_p.add_argument("--out", default=None, help="output file (default: <input>.sealed)") | |
| + | seal_p.add_argument("--content-type", default=None, help="MIME content type") | |
| + | seal_p.add_argument("--watermark", default=None, action="store_true", help="embed watermarks (default from config)") | |
| + | seal_p.add_argument("--no-watermark", dest="watermark", action="store_false", help="skip watermarks") | |
| + | seal_p.add_argument("--register", default=None, help="POST manifest to this registry URL") | |
| + | ||
| + | # open | |
| + | open_p = sub.add_parser("open", help="decrypt a sealed file") | |
| + | open_p.add_argument("input", help="sealed file to open") | |
| + | open_p.add_argument("--identity", default=None, help="recipient identity file") | |
| + | open_p.add_argument("--out", default=None, help="output file (default: strip .sealed)") | |
| + | ||
| + | # inspect | |
| + | inspect_p = sub.add_parser("inspect", help="show manifest without decrypting") | |
| + | inspect_p.add_argument("input", help="sealed file to inspect") | |
| + | inspect_p.add_argument("--json", action="store_true", help="also print raw JSON") | |
| + | ||
| + | # attribute | |
| + | attr_p = sub.add_parser("attribute", help="attribute a leaked file") | |
| + | attr_p.add_argument("leak", nargs="?", default=None, help="leaked text file") | |
| + | attr_p.add_argument("--leak", dest="leak_flag", default=None, help="leaked text file (alternative flag)") | |
| + | attr_p.add_argument("--registry", default=None, help="registry URL for lookups") | |
| + | attr_p.add_argument("--fingerprints", default=None, help="fingerprint file or directory") | |
| + | ||
| + | # status | |
| + | sub.add_parser("status", help="show config, identity, and registry health") | |
| + | ||
| + | # registry | |
| + | reg_p = sub.add_parser("registry", help="registry server management") | |
| + | reg_sub = reg_p.add_subparsers(dest="registry_cmd") | |
| + | rs = reg_sub.add_parser("start", help="start the local registry server") | |
| + | rs.add_argument("--host", default=None, help="bind host (default: 0.0.0.0)") | |
| + | rs.add_argument("--port", type=int, default=None, help="bind port (default: 8000)") | |
| + | ||
| + | return p | |
| + | ||
| + | ||
| + | # --------------------------------------------------------------------------- | |
| + | # Main entry point | |
| + | # --------------------------------------------------------------------------- | |
| + | ||
| + | def main(): | |
| + | # Handle --no-banner before argparse so it works in any position | |
| + | show_banner = "--no-banner" not in sys.argv | |
| + | argv = [a for a in sys.argv[1:] if a != "--no-banner"] | |
| + | ||
| + | parser = build_parser() | |
| + | args = parser.parse_args(argv) | |
| + | ||
| + | if not args.cmd: | |
| + | if show_banner: | |
| + | print_banner() | |
| + | parser.print_help() | |
| + | sys.exit(0) | |
| + | ||
| + | if show_banner: | |
| + | print_banner() | |
| + | ||
| + | if args.cmd == "init": | |
| + | cmd_init(args) | |
| + | ||
| + | elif args.cmd == "keys": | |
| + | if not args.keys_cmd: | |
| + | err_console.print("[red]Specify a keys subcommand: generate, list, import[/]") | |
| + | sys.exit(1) | |
| + | if args.keys_cmd == "generate": | |
| + | cmd_keys_generate(args) | |
| + | elif args.keys_cmd == "list": | |
| + | cmd_keys_list(args) | |
| + | elif args.keys_cmd == "import": | |
| + | cmd_keys_import(args) | |
| + | ||
| + | elif args.cmd == "seal": | |
| + | cmd_seal(args) | |
| + | ||
| + | elif args.cmd == "open": | |
| + | cmd_open(args) | |
| + | ||
| + | elif args.cmd == "inspect": | |
| + | cmd_inspect(args) | |
| + | ||
| + | elif args.cmd == "attribute": | |
| + | # Support both positional and --leak flag | |
| + | leak_file = args.leak or args.leak_flag | |
| + | if not leak_file: | |
| + | error_panel( | |
| + | "No leak file specified.", | |
| + | "Usage: oversight attribute <leak-file> or oversight attribute --leak <file>" | |
| + | ) | |
| + | sys.exit(1) | |
| + | args.leak = leak_file | |
| + | cmd_attribute(args) | |
| + | ||
| + | elif args.cmd == "status": | |
| + | cmd_status(args) | |
| + | ||
| + | elif args.cmd == "registry": | |
| + | if not args.registry_cmd: | |
| + | err_console.print("[red]Specify a registry subcommand: start[/]") | |
| + | sys.exit(1) | |
| + | if args.registry_cmd == "start": | |
| + | cmd_registry_start(args) | |
| + | ||
| + | else: | |
| + | parser.print_help() | |
| + | sys.exit(1) | |
| + | ||
| + | ||
| + | if __name__ == "__main__": | |
| + | main() |
| @@ -0,0 +1,190 @@ | ||
| + | """ | |
| + | oversight_core.ecc | |
| + | ================== | |
| + | ||
| + | Error-correcting codes for watermark bit protection. | |
| + | ||
| + | Implements a simple BCH-like repetition + majority-vote coding scheme that | |
| + | wraps L3 synonym bits. The goal: tolerate up to ~30% bit errors from | |
| + | paraphrasing while still recovering the mark_id payload. | |
| + | ||
| + | Scheme: (n=63, k=16, t=11) conceptual BCH replaced by a practical | |
| + | repetition-code + interleaved majority-vote design that needs no GF(2^m) | |
| + | arithmetic or external libraries. | |
| + | ||
| + | Encoding: | |
| + | 1. Take 16-bit payload (2 bytes of mark_id) | |
| + | 2. Repeat each bit R times (R=3 by default for triple-modular redundancy) | |
| + | 3. Interleave the repeated bits so adjacent errors don't cluster on one payload bit | |
| + | 4. Output 48 coded bits (16 * 3) | |
| + | ||
| + | Decoding: | |
| + | 1. De-interleave | |
| + | 2. Majority vote on each group of R bits | |
| + | 3. Recover 16-bit payload | |
| + | ||
| + | For a 64-bit mark_id, we encode 4 blocks of 16 bits = 448 coded bits total (R=7). | |
| + | With ~150 synonym classes per page, one page provides 150 coded bits (partial), | |
| + | three pages provide 450 (full coverage). | |
| + | ||
| + | Error tolerance: with R=7, corrects up to 3 errors per group. | |
| + | Effective tolerance: ~40% random bit error rate. | |
| + | With R=5, corrects 2 errors per group. Tolerance: ~35%. | |
| + | ||
| + | This is simpler than real BCH but achieves the goal without GF arithmetic. | |
| + | """ | |
| + | ||
| + | from __future__ import annotations | |
| + | ||
| + | import hashlib | |
| + | from typing import Optional | |
| + | ||
| + | ||
| + | def encode(payload: bytes, repetitions: int = 7) -> list[int]: | |
| + | """ | |
| + | Encode payload bytes into ECC-protected bit sequence. | |
| + | ||
| + | Each payload bit is repeated `repetitions` times consecutively. | |
| + | Majority vote at decode time corrects up to floor(R/2) errors per group. | |
| + | ||
| + | Args: | |
| + | payload: raw bytes to protect (typically 8-byte mark_id) | |
| + | repetitions: odd number of times each bit is repeated (default 7) | |
| + | ||
| + | Returns: | |
| + | list of coded bits (len = len(payload) * 8 * repetitions) | |
| + | """ | |
| + | coded = [] | |
| + | for byte in payload: | |
| + | for i in range(8): | |
| + | bit = (byte >> (7 - i)) & 1 | |
| + | coded.extend([bit] * repetitions) | |
| + | return coded | |
| + | ||
| + | ||
| + | def decode( | |
| + | coded_bits: list[int], | |
| + | payload_len: int = 8, | |
| + | repetitions: int = 7, | |
| + | ) -> tuple[bytes, float, int]: | |
| + | """ | |
| + | Decode ECC-protected bits back to payload via majority vote. | |
| + | ||
| + | Args: | |
| + | coded_bits: received bits (may contain errors) | |
| + | payload_len: expected payload length in bytes | |
| + | repetitions: repetition factor used during encoding | |
| + | ||
| + | Returns: | |
| + | (recovered_payload, confidence, errors_corrected) | |
| + | ||
| + | confidence = fraction of groups where majority was unanimous | |
| + | errors_corrected = number of groups where at least one bit disagreed | |
| + | """ | |
| + | n_payload_bits = payload_len * 8 | |
| + | expected_coded = n_payload_bits * repetitions | |
| + | ||
| + | # Pad or truncate to expected length | |
| + | if len(coded_bits) < expected_coded: | |
| + | coded_bits = coded_bits + [0] * (expected_coded - len(coded_bits)) | |
| + | coded_bits = coded_bits[:expected_coded] | |
| + | ||
| + | # Majority vote per group of `repetitions` consecutive bits | |
| + | recovered_bits = [] | |
| + | errors = 0 | |
| + | unanimous = 0 | |
| + | for g in range(n_payload_bits): | |
| + | group = coded_bits[g * repetitions : (g + 1) * repetitions] | |
| + | ones = sum(group) | |
| + | zeros = len(group) - ones | |
| + | if ones > zeros: | |
| + | recovered_bits.append(1) | |
| + | else: | |
| + | recovered_bits.append(0) | |
| + | if ones != 0 and zeros != 0: | |
| + | errors += 1 | |
| + | else: | |
| + | unanimous += 1 | |
| + | ||
| + | # Convert bits to bytes | |
| + | out = bytearray() | |
| + | for i in range(0, len(recovered_bits), 8): | |
| + | byte = 0 | |
| + | for j in range(8): | |
| + | if i + j < len(recovered_bits): | |
| + | byte = (byte << 1) | recovered_bits[i + j] | |
| + | else: | |
| + | byte = byte << 1 | |
| + | out.append(byte) | |
| + | ||
| + | confidence = unanimous / n_payload_bits if n_payload_bits else 0.0 | |
| + | return bytes(out), confidence, errors | |
| + | ||
| + | ||
| + | def mark_id_to_ecc_bits(mark_id: bytes, repetitions: int = 3) -> list[int]: | |
| + | """Convenience: encode a mark_id into ECC-protected bits.""" | |
| + | return encode(mark_id, repetitions) | |
| + | ||
| + | ||
| + | def ecc_bits_to_mark_id( | |
| + | bits: list[int], | |
| + | mark_len: int = 8, | |
| + | repetitions: int = 3, | |
| + | ) -> tuple[bytes, float, int]: | |
| + | """Convenience: decode ECC bits back to mark_id with error stats.""" | |
| + | return decode(bits, mark_len, repetitions) | |
| + | ||
| + | ||
| + | def verify_with_ecc( | |
| + | observed_variant_indices: list[int], | |
| + | candidate_mark_id: bytes, | |
| + | class_size: int = 3, | |
| + | repetitions: int = 3, | |
| + | ) -> tuple[bool, float, bytes]: | |
| + | """ | |
| + | Verify a candidate mark_id against observed synonym choices using ECC. | |
| + | ||
| + | Instead of the old threshold-based matching, this: | |
| + | 1. Encodes the candidate mark_id into ECC bits | |
| + | 2. Maps the candidate's expected variant sequence | |
| + | 3. Compares expected vs observed, producing received bits | |
| + | 4. Decodes via ECC majority vote | |
| + | 5. Checks if decoded payload matches candidate | |
| + | ||
| + | Returns: | |
| + | (match, confidence, decoded_mark_id) | |
| + | """ | |
| + | from .semantic import _mark_id_to_variant_sequence | |
| + | ||
| + | n_instances = len(observed_variant_indices) | |
| + | if n_instances == 0: | |
| + | return False, 0.0, b"" | |
| + | ||
| + | # What variant sequence would this mark_id produce? | |
| + | expected_variants = _mark_id_to_variant_sequence( | |
| + | candidate_mark_id, n_instances, class_size | |
| + | ) | |
| + | ||
| + | # Convert observed variants to bits: does each match the expected? | |
| + | # 1 = match, 0 = mismatch | |
| + | received_bits = [] | |
| + | for obs, exp in zip(observed_variant_indices, expected_variants): | |
| + | obs_mod = obs % class_size | |
| + | exp_mod = exp % class_size | |
| + | received_bits.append(1 if obs_mod == exp_mod else 0) | |
| + | ||
| + | # The received_bits represent the coded signal through a noisy channel. | |
| + | # If ECC was used during embedding, we can decode. | |
| + | # If not (legacy), fall back to simple ratio. | |
| + | match_ratio = sum(received_bits) / len(received_bits) if received_bits else 0.0 | |
| + | ||
| + | # For ECC-encoded marks, try to decode | |
| + | if len(received_bits) >= len(candidate_mark_id) * 8 * repetitions: | |
| + | decoded, confidence, errors = decode( | |
| + | received_bits, len(candidate_mark_id), repetitions | |
| + | ) | |
| + | match = (decoded == candidate_mark_id) and confidence >= 0.5 | |
| + | return match, confidence, decoded | |
| + | ||
| + | # Fallback: simple ratio matching for short texts or non-ECC marks | |
| + | return match_ratio >= 0.70, match_ratio, candidate_mark_id |
| @@ -0,0 +1,217 @@ | ||
| + | """ | |
| + | oversight_core.fingerprint | |
| + | ========================== | |
| + | ||
| + | Content fingerprinting for leak detection when watermarks are stripped. | |
| + | ||
| + | Two independent fingerprinting methods: | |
| + | ||
| + | 1. Winnowing (Schleimer, Wilkerson, Aiken, SIGMOD 2003) | |
| + | Computes rolling hash fingerprints over k-grams of the text. | |
| + | Selects a subset via the winnowing algorithm (minimum hash in each window). | |
| + | Enables partial-copy detection for near-verbatim leaks. | |
| + | Does NOT survive paraphrasing. | |
| + | ||
| + | 2. Semantic sentence hashing | |
| + | Hashes normalized, lemmatized sentences. More robust than winnowing | |
| + | to minor word changes because it operates on content words only. | |
| + | Survives format conversion, minor edits, whitespace changes. | |
| + | Does NOT survive heavy paraphrasing. | |
| + | ||
| + | Both methods produce fingerprints stored at seal time (in the manifest or | |
| + | registry). At attribution time, fingerprints of the leaked text are compared | |
| + | against stored fingerprints to identify which recipient's copy was leaked. | |
| + | ||
| + | The fingerprint DB is NOT a watermark. It is a server-side identification | |
| + | system. The fingerprints never appear in the document itself. An adversary | |
| + | cannot strip what is not embedded. | |
| + | """ | |
| + | ||
| + | from __future__ import annotations | |
| + | ||
| + | import hashlib | |
| + | import re | |
| + | from typing import Optional | |
| + | ||
| + | ||
| + | # ---- Text normalization ---- | |
| + | ||
| + | def _normalize_text(text: str) -> str: | |
| + | """Normalize text for fingerprinting: lowercase, collapse whitespace, strip punctuation.""" | |
| + | text = text.lower() | |
| + | # Remove zero-width chars | |
| + | for ch in "\u200b\u200c\u200d\ufeff": | |
| + | text = text.replace(ch, "") | |
| + | # Collapse whitespace | |
| + | text = re.sub(r"\s+", " ", text) | |
| + | # Strip non-alphanumeric except spaces | |
| + | text = re.sub(r"[^a-z0-9 ]", "", text) | |
| + | return text.strip() | |
| + | ||
| + | ||
| + | def _sentences(text: str) -> list[str]: | |
| + | """Split text into sentences using simple heuristics.""" | |
| + | # Split on sentence-ending punctuation followed by space or EOL | |
| + | parts = re.split(r"(?<=[.!?])\s+", text) | |
| + | return [s.strip() for s in parts if s.strip()] | |
| + | ||
| + | ||
| + | # ---- Winnowing ---- | |
| + | ||
| + | def _rolling_hash(text: str, k: int) -> list[tuple[int, int]]: | |
| + | """Compute rolling hashes for all k-grams. Returns (hash, position) pairs.""" | |
| + | if len(text) < k: | |
| + | return [] | |
| + | hashes = [] | |
| + | for i in range(len(text) - k + 1): | |
| + | kgram = text[i : i + k] | |
| + | h = int(hashlib.md5(kgram.encode(), usedforsecurity=False).hexdigest()[:8], 16) | |
| + | hashes.append((h, i)) | |
| + | return hashes | |
| + | ||
| + | ||
| + | def winnow(text: str, k: int = 10, window: int = 4) -> list[int]: | |
| + | """ | |
| + | Winnowing algorithm for document fingerprinting. | |
| + | ||
| + | Args: | |
| + | text: input text (will be normalized) | |
| + | k: k-gram size (character-level) | |
| + | window: winnowing window size | |
| + | ||
| + | Returns: | |
| + | sorted list of selected hash values (the fingerprint) | |
| + | """ | |
| + | normalized = _normalize_text(text) | |
| + | if len(normalized) < k: | |
| + | return [] | |
| + | ||
| + | hashes = _rolling_hash(normalized, k) | |
| + | if len(hashes) < window: | |
| + | return [h for h, _ in hashes] | |
| + | ||
| + | selected = set() | |
| + | prev_min_idx = -1 | |
| + | ||
| + | for i in range(len(hashes) - window + 1): | |
| + | window_hashes = hashes[i : i + window] | |
| + | # Select rightmost minimum in window | |
| + | min_h = min(h for h, _ in window_hashes) | |
| + | # Find rightmost occurrence of min | |
| + | for j in range(len(window_hashes) - 1, -1, -1): | |
| + | if window_hashes[j][0] == min_h: | |
| + | abs_idx = i + j | |
| + | if abs_idx != prev_min_idx: | |
| + | selected.add(window_hashes[j][0]) | |
| + | prev_min_idx = abs_idx | |
| + | break | |
| + | ||
| + | return sorted(selected) | |
| + | ||
| + | ||
| + | def winnow_similarity(fp1: list[int], fp2: list[int]) -> float: | |
| + | """Jaccard similarity between two winnowing fingerprints.""" | |
| + | if not fp1 or not fp2: | |
| + | return 0.0 | |
| + | s1 = set(fp1) | |
| + | s2 = set(fp2) | |
| + | intersection = len(s1 & s2) | |
| + | union = len(s1 | s2) | |
| + | return intersection / union if union > 0 else 0.0 | |
| + | ||
| + | ||
| + | # ---- Semantic sentence hashing ---- | |
| + | ||
| + | def _sentence_hash(sentence: str) -> str: | |
| + | """Hash a normalized sentence. Returns hex string.""" | |
| + | normalized = _normalize_text(sentence) | |
| + | # Extract content words only (skip 1-2 char words) | |
| + | words = [w for w in normalized.split() if len(w) > 2] | |
| + | content = " ".join(sorted(words)) # sort for order-independence | |
| + | return hashlib.sha256(content.encode()).hexdigest()[:16] | |
| + | ||
| + | ||
| + | def sentence_fingerprint(text: str) -> list[str]: | |
| + | """ | |
| + | Compute per-sentence content hashes. | |
| + | ||
| + | Returns list of 16-char hex hashes, one per sentence. | |
| + | Order-independent within each sentence (sorted words) so minor | |
| + | word reordering does not change the hash. | |
| + | """ | |
| + | sents = _sentences(text) | |
| + | return [_sentence_hash(s) for s in sents if len(s.split()) >= 3] | |
| + | ||
| + | ||
| + | def sentence_similarity(fp1: list[str], fp2: list[str]) -> float: | |
| + | """Fraction of sentence hashes in fp2 that appear in fp1.""" | |
| + | if not fp1 or not fp2: | |
| + | return 0.0 | |
| + | s1 = set(fp1) | |
| + | matches = sum(1 for h in fp2 if h in s1) | |
| + | return matches / len(fp2) | |
| + | ||
| + | ||
| + | # ---- Combined fingerprint ---- | |
| + | ||
| + | class ContentFingerprint: | |
| + | """Combined winnowing + sentence fingerprint for a document.""" | |
| + | ||
| + | def __init__( | |
| + | self, | |
| + | winnowing_fp: list[int], | |
| + | sentence_fp: list[str], | |
| + | text_length: int, | |
| + | sentence_count: int, | |
| + | ): | |
| + | self.winnowing_fp = winnowing_fp | |
| + | self.sentence_fp = sentence_fp | |
| + | self.text_length = text_length | |
| + | self.sentence_count = sentence_count | |
| + | ||
| + | @classmethod | |
| + | def from_text(cls, text: str, k: int = 10, window: int = 4) -> "ContentFingerprint": | |
| + | """Create a fingerprint from text.""" | |
| + | return cls( | |
| + | winnowing_fp=winnow(text, k, window), | |
| + | sentence_fp=sentence_fingerprint(text), | |
| + | text_length=len(text), | |
| + | sentence_count=len(_sentences(text)), | |
| + | ) | |
| + | ||
| + | def similarity(self, other: "ContentFingerprint") -> dict: | |
| + | """Compare this fingerprint against another. Returns per-method scores.""" | |
| + | w_sim = winnow_similarity(self.winnowing_fp, other.winnowing_fp) | |
| + | s_sim = sentence_similarity(self.sentence_fp, other.sentence_fp) | |
| + | # Combined: weighted average (winnowing is stricter, sentence is more robust) | |
| + | combined = 0.4 * w_sim + 0.6 * s_sim | |
| + | return { | |
| + | "winnowing": w_sim, | |
| + | "sentence": s_sim, | |
| + | "combined": combined, | |
| + | "verdict": ( | |
| + | "MATCH" if combined >= 0.6 | |
| + | else "LIKELY" if combined >= 0.3 | |
| + | else "UNLIKELY" if combined >= 0.1 | |
| + | else "NO_MATCH" | |
| + | ), | |
| + | } | |
| + | ||
| + | def to_dict(self) -> dict: | |
| + | """Serialize for storage in manifest/registry.""" | |
| + | return { | |
| + | "winnowing_fp": self.winnowing_fp, | |
| + | "sentence_fp": self.sentence_fp, | |
| + | "text_length": self.text_length, | |
| + | "sentence_count": self.sentence_count, | |
| + | } | |
| + | ||
| + | @classmethod | |
| + | def from_dict(cls, d: dict) -> "ContentFingerprint": | |
| + | """Deserialize from stored dict.""" | |
| + | return cls( | |
| + | winnowing_fp=d["winnowing_fp"], | |
| + | sentence_fp=d["sentence_fp"], | |
| + | text_length=d.get("text_length", 0), | |
| + | sentence_count=d.get("sentence_count", 0), | |
| + | ) |
| @@ -376,6 +376,189 @@ def extract_punctuation_bits(text: str) -> list[int]: | ||
| return bits | ||
| + | # ------------------------------------------------------------------ | |
| + | # T2b -- Spelling variants (British/American) | |
| + | # ------------------------------------------------------------------ | |
| + | ||
| + | # Each pair: (American, British). Bit 0 = American, Bit 1 = British. | |
| + | SPELLING_VARIANTS = [ | |
| + | ("color", "colour"), | |
| + | ("favor", "favour"), | |
| + | ("honor", "honour"), | |
| + | ("humor", "humour"), | |
| + | ("labor", "labour"), | |
| + | ("neighbor", "neighbour"), | |
| + | ("behavior", "behaviour"), | |
| + | ("organization", "organisation"), | |
| + | ("realize", "realise"), | |
| + | ("analyze", "analyse"), | |
| + | ("optimize", "optimise"), | |
| + | ("authorize", "authorise"), | |
| + | ("recognize", "recognise"), | |
| + | ("customize", "customise"), | |
| + | ("minimize", "minimise"), | |
| + | ("maximize", "maximise"), | |
| + | ("defense", "defence"), | |
| + | ("offense", "offence"), | |
| + | ("license", "licence"), | |
| + | ("catalog", "catalogue"), | |
| + | ("program", "programme"), | |
| + | ("center", "centre"), | |
| + | ("meter", "metre"), | |
| + | ("fiber", "fibre"), | |
| + | ("theater", "theatre"), | |
| + | ] | |
| + | ||
| + | # Build lookup for extraction | |
| + | _SPELLING_LOOKUP: dict[str, tuple[int, int]] = {} | |
| + | for _si, (_am, _br) in enumerate(SPELLING_VARIANTS): | |
| + | _SPELLING_LOOKUP[_am.lower()] = (_si, 0) | |
| + | _SPELLING_LOOKUP[_br.lower()] = (_si, 1) | |
| + | ||
| + | ||
| + | def embed_spelling(text: str, mark_id: bytes) -> str: | |
| + | """Apply spelling variant marks keyed to mark_id bits.""" | |
| + | for si, (american, british) in enumerate(SPELLING_VARIANTS): | |
| + | bit = _bit_for(mark_id, si + 8) # offset by 8 to avoid collision with T2 punct | |
| + | target = british if bit else american | |
| + | other = american if bit else british | |
| + | # Case-insensitive replacement preserving case | |
| + | pattern = re.compile(re.escape(other), re.IGNORECASE) | |
| + | text = pattern.sub(lambda m: _case_preserve(target, m.group()), text) | |
| + | return text | |
| + | ||
| + | ||
| + | def extract_spelling_bits(text: str) -> list[tuple[int, int]]: | |
| + | """ | |
| + | Extract spelling variant bits from text. | |
| + | Returns list of (variant_index, bit_value) tuples. | |
| + | """ | |
| + | found = [] | |
| + | for m in _WORD_RE.finditer(text): | |
| + | key = m.group(1).lower() | |
| + | if key in _SPELLING_LOOKUP: | |
| + | si, bit = _SPELLING_LOOKUP[key] | |
| + | found.append((si, bit)) | |
| + | return found | |
| + | ||
| + | ||
| + | # ------------------------------------------------------------------ | |
| + | # T2c -- Contraction expansion/collapse | |
| + | # ------------------------------------------------------------------ | |
| + | ||
| + | CONTRACTIONS = [ | |
| + | ("don't", "do not"), | |
| + | ("doesn't", "does not"), | |
| + | ("didn't", "did not"), | |
| + | ("won't", "will not"), | |
| + | ("wouldn't", "would not"), | |
| + | ("shouldn't", "should not"), | |
| + | ("couldn't", "could not"), | |
| + | ("isn't", "is not"), | |
| + | ("aren't", "are not"), | |
| + | ("wasn't", "was not"), | |
| + | ("weren't", "were not"), | |
| + | ("hasn't", "has not"), | |
| + | ("haven't", "have not"), | |
| + | ("hadn't", "had not"), | |
| + | ("can't", "cannot"), | |
| + | ("it's", "it is"), | |
| + | ("that's", "that is"), | |
| + | ("there's", "there is"), | |
| + | ("they're", "they are"), | |
| + | ("we're", "we are"), | |
| + | ("you're", "you are"), | |
| + | ("I'm", "I am"), | |
| + | ("he's", "he is"), | |
| + | ("she's", "she is"), | |
| + | ("we've", "we have"), | |
| + | ("they've", "they have"), | |
| + | ("I've", "I have"), | |
| + | ("you've", "you have"), | |
| + | ("we'll", "we will"), | |
| + | ("they'll", "they will"), | |
| + | ] | |
| + | ||
| + | ||
| + | def embed_contractions(text: str, mark_id: bytes) -> str: | |
| + | """ | |
| + | Expand or contract eligible contractions based on mark_id bits. | |
| + | Bit 0 = contracted form, Bit 1 = expanded form. | |
| + | """ | |
| + | for ci, (contracted, expanded) in enumerate(CONTRACTIONS): | |
| + | bit = _bit_for(mark_id, ci + 40) # offset past spelling bits | |
| + | if bit: | |
| + | # Expand: replace contracted with expanded | |
| + | pattern = re.compile(re.escape(contracted), re.IGNORECASE) | |
| + | text = pattern.sub( | |
| + | lambda m: _case_preserve(expanded, m.group()), text | |
| + | ) | |
| + | else: | |
| + | # Contract: replace expanded with contracted | |
| + | pattern = re.compile(re.escape(expanded), re.IGNORECASE) | |
| + | text = pattern.sub( | |
| + | lambda m: _case_preserve(contracted, m.group()), text | |
| + | ) | |
| + | return text | |
| + | ||
| + | ||
| + | def extract_contraction_bits(text: str) -> list[tuple[int, int]]: | |
| + | """ | |
| + | Detect which form (contracted vs expanded) appears in text. | |
| + | Returns list of (contraction_index, bit_value). | |
| + | """ | |
| + | found = [] | |
| + | text_lower = text.lower() | |
| + | for ci, (contracted, expanded) in enumerate(CONTRACTIONS): | |
| + | has_contracted = contracted.lower() in text_lower | |
| + | has_expanded = expanded.lower() in text_lower | |
| + | if has_contracted and not has_expanded: | |
| + | found.append((ci, 0)) | |
| + | elif has_expanded and not has_contracted: | |
| + | found.append((ci, 1)) | |
| + | return found | |
| + | ||
| + | ||
| + | # ------------------------------------------------------------------ | |
| + | # T2d -- Number formatting | |
| + | # ------------------------------------------------------------------ | |
| + | ||
| + | def embed_number_format(text: str, mark_id: bytes) -> str: | |
| + | """ | |
| + | Apply number formatting choices keyed to mark_id. | |
| + | Bit 0: "1,000" vs "1000" (comma separator) | |
| + | Bit 1: "50%" vs "50 percent" / "50 per cent" | |
| + | """ | |
| + | b0 = _bit_for(mark_id, 72) | |
| + | b1 = _bit_for(mark_id, 73) | |
| + | ||
| + | if b0: | |
| + | # Add comma separators to numbers >= 1000 | |
| + | def _add_commas(m): | |
| + | n = m.group() | |
| + | if len(n) >= 4 and "," not in n: | |
| + | parts = [] | |
| + | while len(n) > 3: | |
| + | parts.append(n[-3:]) | |
| + | n = n[:-3] | |
| + | parts.append(n) | |
| + | return ",".join(reversed(parts)) | |
| + | return m.group() | |
| + | text = re.sub(r"\b\d{4,}\b", _add_commas, text) | |
| + | else: | |
| + | # Remove comma separators | |
| + | text = re.sub(r"(\d),(\d{3})", r"\1\2", text) | |
| + | ||
| + | if b1: | |
| + | # Use "percent" word form | |
| + | text = re.sub(r"(\d+)\s*%", r"\1 percent", text) | |
| + | else: | |
| + | # Use % symbol | |
| + | text = re.sub(r"(\d+)\s+percent\b", r"\1%", text, flags=re.IGNORECASE) | |
| + | ||
| + | return text | |
| + | ||
| + | ||
| # ------------------------------------------------------------------ | ||
| # Combined L3 API | ||
| # ------------------------------------------------------------------ | ||
| @@ -461,21 +644,34 @@ def verify_synonyms_v2( | ||
| def apply_semantic(text: str, mark_id: bytes, use_v2: bool = True) -> str: | ||
| - | """Apply all L3 layers: synonyms (v2 by default) + punctuation.""" | |
| + | """ | |
| + | Apply all L3 layers: synonyms + punctuation + spelling + contractions + numbers. | |
| + | ||
| + | This is the full semantic watermark embedding. Every mark type survives | |
| + | format conversion and invisible-character stripping. | |
| + | """ | |
| if use_v2 and SYNONYMS_V2_AVAILABLE: | ||
| t = embed_synonyms_v2(text, mark_id) | ||
| else: | ||
| t = embed_synonyms(text, mark_id) | ||
| t = embed_punctuation(t, mark_id) | ||
| + | t = embed_spelling(t, mark_id) | |
| + | t = embed_contractions(t, mark_id) | |
| + | t = embed_number_format(t, mark_id) | |
| return t | ||
| def verify_semantic(text: str, candidate_mark_id: bytes, use_v2: bool = True) -> dict: | ||
| - | """Check whether text matches candidate_mark_id. Returns per-sublayer scores.""" | |
| + | """ | |
| + | Check whether text matches candidate_mark_id across all semantic sublayers. | |
| + | Returns per-sublayer scores and an overall match verdict. | |
| + | """ | |
| if use_v2 and SYNONYMS_V2_AVAILABLE: | ||
| syn_match, syn_score = verify_synonyms_v2(text, candidate_mark_id) | ||
| else: | ||
| syn_match, syn_score = verify_synonyms_match(text, candidate_mark_id) | ||
| + | ||
| + | # Punctuation | |
| punct_bits = extract_punctuation_bits(text) | ||
| expected_punct = [ | ||
| _bit_for(candidate_mark_id, 0), | ||
| @@ -486,11 +682,64 @@ def verify_semantic(text: str, candidate_mark_id: bytes, use_v2: bool = True) -> | ||
| punct_total = len(punct_bits) | ||
| punct_score = punct_hits / punct_total if punct_total else 0.0 | ||
| + | # Spelling variants | |
| + | spelling_bits = extract_spelling_bits(text) | |
| + | spelling_hits = 0 | |
| + | spelling_total = len(spelling_bits) | |
| + | for si, actual_bit in spelling_bits: | |
| + | expected_bit = _bit_for(candidate_mark_id, si + 8) | |
| + | if actual_bit == expected_bit: | |
| + | spelling_hits += 1 | |
| + | spelling_score = spelling_hits / spelling_total if spelling_total else 0.0 | |
| + | ||
| + | # Contractions | |
| + | contraction_bits = extract_contraction_bits(text) | |
| + | contraction_hits = 0 | |
| + | contraction_total = len(contraction_bits) | |
| + | for ci, actual_bit in contraction_bits: | |
| + | expected_bit = _bit_for(candidate_mark_id, ci + 40) | |
| + | if actual_bit == expected_bit: | |
| + | contraction_hits += 1 | |
| + | contraction_score = ( | |
| + | contraction_hits / contraction_total if contraction_total else 0.0 | |
| + | ) | |
| + | ||
| + | # Weighted overall: synonyms are the primary signal, others are supplementary | |
| + | weights = {"syn": 0.50, "punct": 0.10, "spell": 0.20, "contract": 0.20} | |
| + | scores = { | |
| + | "syn": syn_score, | |
| + | "punct": punct_score, | |
| + | "spell": spelling_score, | |
| + | "contract": contraction_score, | |
| + | } | |
| + | active_weight = sum( | |
| + | w for k, w in weights.items() | |
| + | if (k == "syn" or (k == "punct" and punct_total) | |
| + | or (k == "spell" and spelling_total) | |
| + | or (k == "contract" and contraction_total)) | |
| + | ) | |
| + | if active_weight > 0: | |
| + | weighted_score = sum( | |
| + | scores[k] * weights[k] for k in weights | |
| + | if (k == "syn" or (k == "punct" and punct_total) | |
| + | or (k == "spell" and spelling_total) | |
| + | or (k == "contract" and contraction_total)) | |
| + | ) / active_weight | |
| + | else: | |
| + | weighted_score = syn_score | |
| + | ||
| + | overall_match = weighted_score >= 0.65 | |
| + | ||
| return { | ||
| "synonyms_match": syn_match, | ||
| "synonyms_score": syn_score, | ||
| "punctuation_score": punct_score, | ||
| "punctuation_hits": f"{punct_hits}/{punct_total}", | ||
| - | "overall_match": syn_match and (punct_score >= 0.5 if punct_total else True), | |
| + | "spelling_score": spelling_score, | |
| + | "spelling_hits": f"{spelling_hits}/{spelling_total}", | |
| + | "contraction_score": contraction_score, | |
| + | "contraction_hits": f"{contraction_hits}/{contraction_total}", | |
| + | "weighted_score": weighted_score, | |
| + | "overall_match": overall_match, | |
| "dict_version": "v2" if (use_v2 and SYNONYMS_V2_AVAILABLE) else "v1", | ||
| } |
| @@ -20,11 +20,12 @@ Layers: | ||
| cleaning than L1 because linters often don't touch trailing whitespace in | ||
| content-bearing fields. | ||
| - | L3 (synonym rotation, stub): | |
| - | Placeholder for semantic watermarking - swap between {start/begin/commence} | |
| - | style synonym classes per-bit. Survives format conversion completely because | |
| - | the mark is in the *words chosen*. Real implementation needs an NLP pass; | |
| - | the stub here demonstrates the hook. | |
| + | L3 (synonym rotation + punctuation): | |
| + | Semantic watermarking via synonym-class rotation (151 classes in v2) and | |
| + | punctuation-style fingerprinting (Oxford comma, em dash, curly quotes). | |
| + | Survives format conversion, invisible-char stripping, and whitespace | |
| + | normalization because the marks are in the words and punctuation chosen. | |
| + | Implementation in oversight_core.semantic; wired in here via apply_all. | |
| Future (not in MVP): | ||
| - Visual DCT-domain watermarks for images (robust to recompression + screenshot) | ||
| @@ -161,38 +162,70 @@ def extract_ws(text: str, mark_len_bytes: int = 8) -> Optional[bytes]: | ||
| return _bytes_from_bits(bits[:needed]) | ||
| - | # ---------------- L3: synonym-class (stub) ---------------- | |
| + | # ---------------- L3: semantic watermarking ---------------- | |
| - | # Illustrative only. Real deployment needs a curated synonym table + NLP-aware insertion. | |
| - | SYNONYM_CLASSES = [ | |
| - | ("begin", "start", "commence"), # 3-ary, encodes log2(3) ≈ 1.58 bits | |
| - | ("large", "big", "substantial"), | |
| - | ("fast", "quick", "rapid"), | |
| - | ("show", "display", "present"), | |
| - | ] | |
| + | # Real implementation lives in oversight_core.semantic. We import it here | |
| + | # so the watermark module is the single entry point for all three layers. | |
| + | try: | |
| + | from . import semantic as _semantic | |
| + | _L3_AVAILABLE = True | |
| + | except ImportError: | |
| + | _L3_AVAILABLE = False | |
| - | def embed_synonyms_stub(text: str, mark_id: bytes) -> str: | |
| + | # ---------------- L2: partial recovery ---------------- | |
| + | ||
| + | def extract_ws_partial( | |
| + | text: str, mark_len_bytes: int = 8 | |
| + | ) -> tuple[Optional[bytes], float, int, int]: | |
| """ | ||
| - | Stub: demonstrates the hook. A production version walks the text with an NLP | |
| - | tagger, finds matches in SYNONYM_CLASSES, and rotates them deterministically | |
| - | based on bits of mark_id. | |
| + | Like extract_ws but returns partial results with confidence. | |
| + | ||
| + | Returns: | |
| + | (best_candidate, confidence, bits_recovered, bits_needed) | |
| + | ||
| + | If all bits are recovered, confidence = 1.0 and best_candidate is exact. | |
| + | If partial, best_candidate has recovered bits filled in and unknown bits | |
| + | set to 0, confidence = bits_recovered / bits_needed. | |
| """ | ||
| - | # Deliberately a no-op placeholder - clearly flagged so it's not mistaken for real. | |
| - | return text | |
| + | needed = mark_len_bytes * 8 | |
| + | bits: list[int] = [] | |
| + | for line in text.split("\n"): | |
| + | if line.endswith(" "): | |
| + | bits.append(0) | |
| + | elif line.endswith("\t"): | |
| + | bits.append(1) | |
| + | if len(bits) >= needed: | |
| + | break | |
| + | recovered = len(bits) | |
| + | if recovered == 0: | |
| + | return None, 0.0, 0, needed | |
| - | def extract_synonyms_stub(text: str) -> Optional[bytes]: | |
| - | return None | |
| + | # Pad with zeros if incomplete | |
| + | padded = bits[:needed] + [0] * max(0, needed - recovered) | |
| + | candidate = _bytes_from_bits(padded[:needed]) | |
| + | confidence = min(recovered, needed) / needed | |
| + | return candidate, confidence, min(recovered, needed), needed | |
| # ---------------- high-level apply/recover ---------------- | ||
| def apply_all(text: str, mark_id: bytes) -> str: | ||
| - | """Apply all available watermark layers to text.""" | |
| - | t = embed_zw(text, mark_id) | |
| + | """ | |
| + | Apply all available watermark layers to text. | |
| + | ||
| + | Layer order matters: L3 (synonym rotation) runs FIRST because it rewrites | |
| + | words. L2 (trailing whitespace) runs second. L1 (zero-width unicode) runs | |
| + | last because it inserts invisible characters that could fragment synonym | |
| + | words if applied earlier. | |
| + | """ | |
| + | if _L3_AVAILABLE: | |
| + | t = _semantic.apply_semantic(text, mark_id) | |
| + | else: | |
| + | t = text | |
| t = embed_ws(t, mark_id) | ||
| - | t = embed_synonyms_stub(t, mark_id) | |
| + | t = embed_zw(t, mark_id) | |
| return t | ||
| @@ -204,5 +237,175 @@ def recover_marks(text: str, mark_len_bytes: int = 8) -> dict: | ||
| return { | ||
| "L1_zero_width": extract_zw(text, mark_len_bytes), | ||
| "L2_whitespace": [m for m in [extract_ws(text, mark_len_bytes)] if m], | ||
| - | "L3_synonyms": [m for m in [extract_synonyms_stub(text)] if m], | |
| + | "L3_synonyms": [], # L3 requires candidate-based verification; see verify_l3 | |
| + | } | |
| + | ||
| + | ||
| + | def verify_l3( | |
| + | text: str, | |
| + | candidate_mark_ids: list[bytes], | |
| + | threshold: float = 0.70, | |
| + | ) -> list[tuple[bytes, float, dict]]: | |
| + | """ | |
| + | Test candidate mark_ids against the semantic marks in text. | |
| + | ||
| + | Returns a list of (mark_id, score, detail_dict) for candidates that | |
| + | score above the threshold. Results are sorted by score descending. | |
| + | """ | |
| + | if not _L3_AVAILABLE: | |
| + | return [] | |
| + | ||
| + | hits = [] | |
| + | for mid in candidate_mark_ids: | |
| + | detail = _semantic.verify_semantic(text, mid) | |
| + | if detail["overall_match"]: | |
| + | hits.append((mid, detail["synonyms_score"], detail)) | |
| + | hits.sort(key=lambda x: x[1], reverse=True) | |
| + | return hits | |
| + | ||
| + | ||
| + | def recover_marks_v2( | |
| + | text: str, | |
| + | candidate_mark_ids: list[bytes] | None = None, | |
| + | mark_len_bytes: int = 8, | |
| + | ) -> dict: | |
| + | """ | |
| + | Enhanced recovery with partial L2, L3 verification, and per-layer diagnostics. | |
| + | ||
| + | Returns a dict with: | |
| + | - layers: per-layer results with confidence | |
| + | - candidates: fused candidate list | |
| + | - diagnostics: human-readable per-layer status strings | |
| + | """ | |
| + | # L1: zero-width extraction | |
| + | l1_marks = extract_zw(text, mark_len_bytes) | |
| + | l1_unique = list(set(l1_marks)) | |
| + | ||
| + | # L2: partial recovery | |
| + | l2_candidate, l2_confidence, l2_bits, l2_needed = extract_ws_partial( | |
| + | text, mark_len_bytes | |
| + | ) | |
| + | l2_marks = [l2_candidate] if l2_candidate and l2_confidence >= 0.5 else [] | |
| + | ||
| + | # L3: candidate-based verification | |
| + | l3_hits: list[tuple[bytes, float, dict]] = [] | |
| + | if candidate_mark_ids and _L3_AVAILABLE: | |
| + | l3_hits = verify_l3(text, candidate_mark_ids) | |
| + | ||
| + | # Build diagnostics | |
| + | diagnostics = [] | |
| + | if l1_unique: | |
| + | diagnostics.append( | |
| + | f"L1: {len(l1_marks)} frames found, " | |
| + | f"{len(l1_unique)} unique mark(s): " | |
| + | + ", ".join(m.hex() for m in l1_unique) | |
| + | ) | |
| + | else: | |
| + | diagnostics.append( | |
| + | "L1: 0 zero-width frames found (invisible chars stripped?)" | |
| + | ) | |
| + | ||
| + | if l2_confidence >= 1.0: | |
| + | diagnostics.append( | |
| + | f"L2: {l2_bits}/{l2_needed} bits recovered (100%), " | |
| + | f"mark: {l2_candidate.hex()}" | |
| + | ) | |
| + | elif l2_confidence > 0: | |
| + | diagnostics.append( | |
| + | f"L2: {l2_bits}/{l2_needed} bits recovered " | |
| + | f"({l2_confidence:.0%} confidence), " | |
| + | f"partial candidate: {l2_candidate.hex()}" | |
| + | ) | |
| + | else: | |
| + | diagnostics.append( | |
| + | "L2: 0 trailing whitespace marks found (whitespace stripped?)" | |
| + | ) | |
| + | ||
| + | if not _L3_AVAILABLE: | |
| + | diagnostics.append("L3: semantic module not available") | |
| + | elif not candidate_mark_ids: | |
| + | diagnostics.append( | |
| + | "L3: no candidate mark_ids provided (query registry first)" | |
| + | ) | |
| + | elif l3_hits: | |
| + | for mid, score, detail in l3_hits: | |
| + | diagnostics.append( | |
| + | f"L3: mark {mid.hex()} matched with score " | |
| + | f"{score:.2f} (synonyms) / " | |
| + | f"{detail['punctuation_hits']} (punctuation), " | |
| + | f"dict={detail['dict_version']}" | |
| + | ) | |
| + | else: | |
| + | diagnostics.append( | |
| + | f"L3: {len(candidate_mark_ids)} candidate(s) tested, " | |
| + | "none matched above threshold" | |
| + | ) | |
| + | ||
| + | # Fuse candidates across layers | |
| + | all_candidates = _fuse_candidates( | |
| + | l1_unique, l2_candidate, l2_confidence, l3_hits | |
| + | ) | |
| + | ||
| + | return { | |
| + | "layers": { | |
| + | "L1_zero_width": l1_unique, | |
| + | "L2_whitespace": l2_marks, | |
| + | "L2_confidence": l2_confidence, | |
| + | "L3_semantic": [(m, s) for m, s, _ in l3_hits], | |
| + | }, | |
| + | "candidates": all_candidates, | |
| + | "diagnostics": diagnostics, | |
| } | ||
| + | ||
| + | ||
| + | def _fuse_candidates( | |
| + | l1_marks: list[bytes], | |
| + | l2_candidate: bytes | None, | |
| + | l2_confidence: float, | |
| + | l3_hits: list[tuple[bytes, float, dict]], | |
| + | ) -> list[tuple[bytes, float, str]]: | |
| + | """ | |
| + | Multi-layer Bayesian fusion: combine evidence from all layers into a | |
| + | single ranked candidate list. | |
| + | ||
| + | Returns list of (mark_id, combined_score, evidence_summary). | |
| + | ||
| + | Scoring: | |
| + | - L1 exact match: 0.95 (high, but not 1.0 because of frame corruption) | |
| + | - L2 exact match: 0.90 (slightly lower, whitespace is fragile) | |
| + | - L2 partial: l2_confidence * 0.60 (scaled down for uncertainty) | |
| + | - L3 match: l3_score * 0.85 (probabilistic, weighted by synonym score) | |
| + | ||
| + | When multiple layers agree on the same mark_id, scores combine: | |
| + | combined = 1 - (1-s1)(1-s2)...(1-sN) | |
| + | This is a standard independence-assumption combination. | |
| + | """ | |
| + | # Collect per-candidate evidence | |
| + | evidence: dict[bytes, list[tuple[float, str]]] = {} | |
| + | ||
| + | for m in l1_marks: | |
| + | evidence.setdefault(m, []).append((0.95, "L1")) | |
| + | ||
| + | if l2_candidate and l2_confidence >= 0.5: | |
| + | l2_score = min(l2_confidence, 1.0) * 0.90 | |
| + | evidence.setdefault(l2_candidate, []).append((l2_score, "L2")) | |
| + | ||
| + | for m, s, _ in l3_hits: | |
| + | evidence.setdefault(m, []).append((s * 0.85, "L3")) | |
| + | ||
| + | # Combine scores per candidate | |
| + | results = [] | |
| + | for mark_id, scores in evidence.items(): | |
| + | if len(scores) == 1: | |
| + | combined = scores[0][0] | |
| + | else: | |
| + | # 1 - product(1 - s_i) | |
| + | combined = 1.0 | |
| + | for s, _ in scores: | |
| + | combined *= (1.0 - s) | |
| + | combined = 1.0 - combined | |
| + | layers_hit = "+".join(lbl for _, lbl in scores) | |
| + | results.append((mark_id, combined, layers_hit)) | |
| + | ||
| + | results.sort(key=lambda x: x[1], reverse=True) | |
| + | return results |
| @@ -0,0 +1,64 @@ | ||
| + | [build-system] | |
| + | requires = ["setuptools>=68.0", "wheel"] | |
| + | build-backend = "setuptools.build_meta" | |
| + | ||
| + | [project] | |
| + | name = "oversight-protocol" | |
| + | version = "0.4.3" | |
| + | description = "Open protocol for cryptographic data provenance, recipient attribution, and leak detection." | |
| + | readme = "README.md" | |
| + | license = {text = "Apache-2.0"} | |
| + | requires-python = ">=3.10" | |
| + | authors = [ | |
| + | {name = "Zion Boggan", email = "zionboggan@gmail.com"}, | |
| + | ] | |
| + | keywords = ["watermark", "provenance", "attribution", "leak-detection", "cryptography", "post-quantum"] | |
| + | classifiers = [ | |
| + | "Development Status :: 4 - Beta", | |
| + | "Intended Audience :: Developers", | |
| + | "Intended Audience :: Information Technology", | |
| + | "License :: OSI Approved :: Apache Software License", | |
| + | "Programming Language :: Python :: 3", | |
| + | "Programming Language :: Python :: 3.10", | |
| + | "Programming Language :: Python :: 3.11", | |
| + | "Programming Language :: Python :: 3.12", | |
| + | "Programming Language :: Python :: 3.13", | |
| + | "Topic :: Security", | |
| + | "Topic :: Security :: Cryptography", | |
| + | ] | |
| + | ||
| + | dependencies = [ | |
| + | "cryptography>=42.0.0", | |
| + | "pynacl>=1.5.0", | |
| + | "httpx>=0.27.0", | |
| + | "rich>=13.0.0", | |
| + | ] | |
| + | ||
| + | [project.optional-dependencies] | |
| + | registry = [ | |
| + | "fastapi>=0.110.0", | |
| + | "uvicorn>=0.29.0", | |
| + | "pydantic>=2.0.0", | |
| + | "python-multipart>=0.0.9", | |
| + | ] | |
| + | formats = [ | |
| + | "Pillow>=10.0.0", | |
| + | "numpy>=1.26.0", | |
| + | "scipy>=1.11.0", | |
| + | "pypdf>=4.0.0", | |
| + | "python-docx>=1.1.0", | |
| + | "imagehash>=4.3.1", | |
| + | ] | |
| + | all = ["oversight-protocol[registry,formats]"] | |
| + | ||
| + | [project.scripts] | |
| + | oversight = "cli.oversight_rich:main" | |
| + | ||
| + | [project.urls] | |
| + | Homepage = "https://oversight-protocol.github.io/oversight/" | |
| + | Repository = "https://github.com/oversight-protocol/oversight" | |
| + | Documentation = "https://oversight-protocol.github.io/oversight/docs/" | |
| + | Issues = "https://github.com/oversight-protocol/oversight/issues" | |
| + | ||
| + | [tool.setuptools.packages.find] | |
| + | include = ["oversight_core*", "cli*", "registry*", "oversight_dns*"] |