Zion Boggan
repos/Oversight/oversight_core/formats/text.py
zionboggan.com ↗
65 lines · python
History for this file →
1
"""
2
oversight_core.formats.text - text format adapter.
3
 
4
Wraps the three watermark layers:
5
    L1 zero-width unicode    (watermark.py)
6
    L2 trailing whitespace   (watermark.py)
7
    L3 semantic              (semantic.py)
8
 
9
into a single apply/recover API.
10
"""
11
 
12
from __future__ import annotations
13
 
14
from .. import watermark, l3_policy, semantic
15
 
16
 
17
def apply(text: str, mark_id: bytes, layers: tuple[str, ...] = ("L1", "L2")) -> str:
    """Embed ``mark_id`` into ``text`` using each layer named in ``layers``.

    Stages always run in the fixed order L3 -> L2 -> L1, regardless of the
    order of names in ``layers``: L3 rewrites visible words, so it has to
    finish before the L2/L1 steganographic layers add trailing whitespace and
    zero-width characters. Names other than "L1", "L2" and "L3" are ignored.

    Returns the text after every requested stage has been applied; with no
    recognised layer requested, ``text`` is returned unchanged.
    """
    # Ordered (layer name, embedder) pairs. Each embedder is wrapped in a
    # lambda so the backing module is only touched when its layer is requested.
    stages = (
        ("L3", lambda s: l3_policy.apply_l3_safe(s, mark_id, mode="full")),
        ("L2", lambda s: watermark.embed_ws(s, mark_id)),
        ("L1", lambda s: watermark.embed_zw(s, mark_id)),
    )
    marked = text
    for layer_name, embed in stages:
        if layer_name in layers:
            marked = embed(marked)
    return marked
31
 
32
 
33
def recover(text: str, candidate_mark_ids: list[bytes] | None = None) -> dict:
    """
    Recover attribution from text.

    Args:
      text: the text to inspect for watermark layers.
      candidate_mark_ids: mark IDs to verify against the L3 layer. When this
        is None or empty, L3 verification is skipped and "L3_matches" stays
        empty.

    Returns:
      {
        "L1_hits": [mark_id_hex, ...],
        "L2_hits": [mark_id_hex, ...],
        "L3_matches": [
          {"mark_id": mark_id_hex, "syn_score": ..., "punct_score": ...},
          ...
        ]
      }

    L1 and L2 recover the mark_id directly from invisible content.
    L3 requires candidate_mark_ids (usually from the registry) to verify against;
    only candidates whose verification reports an overall match are listed.
    """
    out = {
        "L1_hits": [m.hex() for m in watermark.extract_zw(text)],
        "L2_hits": [],
        "L3_matches": [],
    }

    # extract_ws yields a single value; a falsy result is treated as "no hit".
    ws = watermark.extract_ws(text)
    if ws:
        out["L2_hits"].append(ws.hex())

    # `or ()` makes a missing/empty candidate list a no-op loop.
    for cm in candidate_mark_ids or ():
        result = semantic.verify_semantic(text, cm)
        if result["overall_match"]:
            out["L3_matches"].append({
                "mark_id": cm.hex(),
                "syn_score": result["synonyms_score"],
                "punct_score": result["punctuation_score"],
            })
    return out