Zion Boggan
repos/Oversight/oversight_core/synonyms_v2.py
zionboggan.com ↗
253 lines · python
History for this file →
1
"""
2
oversight_core.synonyms_v2
3
=========================
4
 
5
Expanded synonym table for L3 semantic watermarking, with part-of-speech
6
tagging and URL/code-block skip logic.
7
 
8
v0.2.1 additions over the 27-class v1 list:
9
  - ~150 classes (verbs, adjectives, adverbs, nouns, connectors)
10
  - Part-of-speech tagging via a simple word-level heuristic (no spaCy dep)
11
  - Skips matches inside URLs, file paths, email addresses, code spans
12
  - Match rules: class entries are grouped by POS so we never swap e.g.
13
    "bank" (noun) with "bank" (verb) variants
14
 
15
Bit capacity at typical prose density (one match per ~10 words):
16
   v1 (27 classes):   ~40-70 bits per page
17
   v2 (~150 classes): ~120-180 bits per page
18
This is enough to redundantly encode a 64-bit mark id multiple times per page.
19
 
20
For cryptographer-grade rigor: keep the class table in a separate versioned
21
file (`synonyms_v2.py` here) and tag each manifest with the table version
22
used, so attribution reliably replays the exact variant space.
23
"""
24
 
25
from __future__ import annotations
26
 
27
import re
28
from typing import Iterator, NamedTuple
29
 
30
 
31
class SynonymClass(NamedTuple):
    """One group of interchangeable words together with its part-of-speech tag."""

    # Interchangeable surface forms; a word's position in this tuple is the
    # variant index recorded for it in the module-level lookup table.
    variants: tuple[str, ...]
    # Part-of-speech label; the tables in this module use "verb", "adj",
    # "adv", "noun" and "conj".
    pos: str
34
 
35
 
36
 
37
# Verb classes. Each inner tuple is one class of interchangeable verbs; the
# order of the tuples and of the words inside them is significant because the
# module-level lookup records (class index, variant index) by position.
# Variants containing a space ("give back", "set up") stay in the class but
# are left out of the single-word lookup built further down.
VERBS: list[SynonymClass] = [
    SynonymClass(variants, "verb")
    for variants in (
        ("begin", "start", "commence"),
        ("end", "finish", "conclude"),
        ("use", "utilize", "employ"),
        ("make", "create", "produce"),
        ("get", "obtain", "acquire"),
        ("find", "locate", "identify"),
        ("show", "display", "present"),
        ("tell", "inform", "notify"),
        ("give", "provide", "supply"),
        ("help", "assist", "aid"),
        ("think", "believe", "consider"),
        ("know", "understand", "recognize"),
        ("see", "observe", "notice"),
        ("want", "desire", "need"),
        ("look", "appear", "seem"),
        ("ask", "request", "query"),
        ("send", "transmit", "deliver"),
        ("allow", "permit", "enable"),
        ("stop", "halt", "cease"),
        ("continue", "proceed", "persist"),
        ("try", "attempt", "endeavor"),
        ("change", "modify", "alter"),
        ("add", "append", "include"),
        ("remove", "delete", "eliminate"),
        ("check", "verify", "confirm"),
        ("review", "examine", "evaluate"),
        ("agree", "concur", "consent"),
        ("decide", "determine", "resolve"),
        ("require", "need", "demand"),
        ("contain", "include", "hold"),
        ("return", "yield", "give back"),
        ("create", "generate", "build"),
        ("destroy", "eliminate", "eradicate"),
        ("improve", "enhance", "upgrade"),
        ("protect", "safeguard", "defend"),
        ("discuss", "address", "cover"),
        ("explain", "clarify", "describe"),
        ("propose", "suggest", "recommend"),
        ("demonstrate", "show", "prove"),
        ("achieve", "accomplish", "attain"),
        ("manage", "handle", "administer"),
        ("develop", "build", "engineer"),
        ("establish", "set up", "institute"),
        ("support", "back", "endorse"),
        ("reject", "refuse", "decline"),
        ("reduce", "decrease", "lower"),
        ("increase", "raise", "boost"),
        ("operate", "run", "function"),
        ("execute", "perform", "run"),
        ("investigate", "examine", "research"),
    )
]
89
 
90
# Adjective classes. Tuple order and word order are significant: the
# module-level lookup records (class index, variant index) by position.
ADJECTIVES: list[SynonymClass] = [
    SynonymClass(variants, "adj")
    for variants in (
        ("big", "large", "substantial"),
        ("small", "tiny", "minor"),
        ("fast", "quick", "rapid"),
        ("slow", "gradual", "deliberate"),
        ("important", "critical", "significant"),
        ("hard", "difficult", "challenging"),
        ("easy", "simple", "straightforward"),
        ("good", "excellent", "effective"),
        ("bad", "poor", "inferior"),
        ("new", "recent", "current"),
        ("old", "prior", "previous"),
        ("common", "typical", "standard"),
        ("rare", "unusual", "uncommon"),
        ("safe", "secure", "protected"),
        ("dangerous", "risky", "hazardous"),
        ("correct", "accurate", "right"),
        ("wrong", "incorrect", "mistaken"),
        ("clear", "obvious", "evident"),
        ("unclear", "vague", "ambiguous"),
        ("strong", "robust", "powerful"),
        ("weak", "fragile", "limited"),
        ("full", "complete", "entire"),
        ("empty", "vacant", "bare"),
        ("open", "available", "accessible"),
        ("closed", "sealed", "restricted"),
        ("visible", "apparent", "observable"),
        ("hidden", "concealed", "obscured"),
        ("public", "open", "unrestricted"),
        ("private", "confidential", "restricted"),
        ("complete", "finished", "done"),
        ("partial", "incomplete", "limited"),
        ("useful", "helpful", "valuable"),
        ("useless", "pointless", "ineffective"),
        ("interesting", "engaging", "compelling"),
        ("boring", "dull", "tedious"),
        ("early", "initial", "preliminary"),
        ("late", "delayed", "overdue"),
        ("possible", "feasible", "viable"),
        ("impossible", "unfeasible", "impractical"),
        ("normal", "typical", "regular"),
        ("abnormal", "unusual", "atypical"),
        ("high", "elevated", "significant"),
        ("low", "reduced", "minimal"),
    )
]
135
 
136
# Adverb classes. Tuple order and word order are significant: the
# module-level lookup records (class index, variant index) by position.
# The multi-word variants ("not ever", "at no time") contain spaces and are
# therefore left out of the single-word lookup built further down.
ADVERBS: list[SynonymClass] = [
    SynonymClass(variants, "adv")
    for variants in (
        ("quickly", "rapidly", "swiftly"),
        ("slowly", "gradually", "steadily"),
        ("carefully", "cautiously", "thoroughly"),
        ("often", "frequently", "regularly"),
        ("rarely", "seldom", "infrequently"),
        ("usually", "typically", "generally"),
        ("sometimes", "occasionally", "periodically"),
        ("always", "consistently", "invariably"),
        ("never", "not ever", "at no time"),
        ("clearly", "obviously", "plainly"),
        ("exactly", "precisely", "specifically"),
        ("approximately", "roughly", "around"),
        ("completely", "entirely", "fully"),
        ("partially", "partly", "somewhat"),
        ("immediately", "instantly", "promptly"),
        ("eventually", "ultimately", "finally"),
        ("recently", "lately", "newly"),
        ("currently", "presently", "now"),
        ("previously", "formerly", "earlier"),
        ("easily", "readily", "effortlessly"),
    )
]
158
 
159
# Noun classes. Tuple order and word order are significant: the module-level
# lookup records (class index, variant index) by position.
NOUNS: list[SynonymClass] = [
    SynonymClass(variants, "noun")
    for variants in (
        ("problem", "issue", "concern"),
        ("answer", "response", "reply"),
        ("question", "query", "inquiry"),
        ("idea", "concept", "notion"),
        ("plan", "strategy", "approach"),
        ("result", "outcome", "consequence"),
        ("method", "approach", "technique"),
        ("goal", "objective", "aim"),
        ("change", "modification", "alteration"),
        ("system", "framework", "structure"),
        ("process", "procedure", "workflow"),
        ("feature", "function", "capability"),
        ("effect", "impact", "influence"),
        ("cause", "reason", "source"),
        ("example", "instance", "case"),
        ("detail", "particular", "specific"),
        ("summary", "overview", "synopsis"),
        ("notice", "notification", "alert"),
        ("record", "log", "entry"),
        ("report", "document", "write-up"),
        ("data", "information", "content"),
        ("value", "amount", "quantity"),
        ("location", "place", "site"),
        ("time", "moment", "instant"),
        ("benefit", "advantage", "gain"),
        ("risk", "hazard", "threat"),
        ("error", "mistake", "flaw"),
        ("need", "requirement", "necessity"),
        ("request", "application", "petition"),
        ("opportunity", "chance", "possibility"),
    )
]
191
 
192
# Connector classes, tagged "conj". Tuple order and word order are
# significant: the module-level lookup records (class index, variant index)
# by position.
CONNECTORS: list[SynonymClass] = [
    SynonymClass(variants, "conj")
    for variants in (
        ("however", "nevertheless", "nonetheless"),
        ("therefore", "consequently", "thus"),
        ("also", "additionally", "furthermore"),
        ("but", "yet", "though"),
        ("because", "since", "as"),
        ("although", "while", "whereas"),
        ("similarly", "likewise", "comparably"),
        ("instead", "rather", "alternatively"),
    )
]
202
 
203
 
204
# Flat list of every class; a class's position in this list is its class index.
ALL_CLASSES: list[SynonymClass] = [*VERBS, *ADJECTIVES, *ADVERBS, *NOUNS, *CONNECTORS]

# Lower-cased single-token variant -> (class_index, variant_index, pos).
# A word listed in more than one class keeps the entry of the first class
# that lists it; variants containing a space are not indexed at all.
_LOOKUP: dict[str, tuple[int, int, str]] = {}
# The loop names are left unchanged because they stay bound at module level
# once the loop finishes.
for ci, cls in enumerate(ALL_CLASSES):
    for vi, word in enumerate(cls.variants):
        if " " in word:
            continue
        # setdefault only inserts when the key is absent, so the first class wins.
        _LOOKUP.setdefault(word.lower(), (ci, vi, cls.pos))

# Number of synonym classes (not the number of indexed words).
SYNONYM_COUNT = len(ALL_CLASSES)
215
 
216
 
217
 
218
# Spans of text that must never be rewritten. Every character covered by a
# match of any pattern below is masked out before synonym matching, so the
# order of the list does not affect the result.
#
# NOTE: the Windows-path pattern previously required two literal backslashes
# after the drive colon (r"\\\\" inside a raw string), so an ordinary path such
# as C:\Users\find was not masked. It now requires one backslash; the doubled
# form still matches because the second backslash is consumed by [^\s]+.
# This enlarges the skipped region for such paths, which changes which words
# are matchable in texts that contain them.
_SKIP_PATTERNS = [
    # http:// and https:// URLs, up to the next whitespace.
    re.compile(r"https?://\S+"),
    # Email addresses.
    re.compile(r"\b[\w.+-]+@[\w.-]+\.\w+\b"),
    # Inline code spans delimited by single backticks.
    re.compile(r"`[^`]+`"),
    # Fenced code blocks delimited by triple backticks (non-greedy).
    re.compile(r"```[\s\S]*?```"),
    # Paths starting with "/", "~/" or "./" at the start of the text or after whitespace.
    re.compile(r"(?:^|\s)(?:/|~/|\./)[^\s]+"),
    # Windows drive paths such as C:\dir\file (single or doubled backslash).
    re.compile(r"\b[A-Za-z]:\\[^\s]+"),
    # Runs of 16 or more hexadecimal digits.
    re.compile(r"\b[A-Fa-f0-9]{16,}\b"),
    # Runs of 32 or more base64-alphabet characters with optional "=" padding.
    re.compile(r"\b[A-Za-z0-9+/]{32,}={0,2}\b"),
]
228
 
229
 
230
# Maximal runs of ASCII letters delimited by word boundaries.
_WORD_RE = re.compile(r"\b([A-Za-z]+)\b")


def iter_matchable_words(text: str) -> Iterator[tuple[int, int, str, tuple[int, int, str]]]:
    """
    Yield each synonym-table word in ``text`` that lies outside every skip region.

    Each item is ``(start, end, word, (class_index, variant_index, pos))``:
    ``start``/``end`` are the character offsets of the word in ``text``,
    ``word`` is the text exactly as written, and the inner tuple is the
    ``_LOOKUP`` entry for its lower-cased form.

    A word is not yielded when any of its characters falls inside a match of
    one of ``_SKIP_PATTERNS``, when it is written entirely in upper case, or
    when it starts with an upper-case letter anywhere other than offset 0.

    This is the production entry point for L3 embedding and verification.
    """
    # One flag per character: True where at least one skip pattern matched.
    in_skip_region = [False] * len(text)
    for pattern in _SKIP_PATTERNS:
        for hit in pattern.finditer(text):
            lo, hi = hit.span()
            in_skip_region[lo:hi] = [True] * (hi - lo)

    for hit in _WORD_RE.finditer(text):
        lo, hi = hit.span()
        # Any overlap with a skip region disqualifies the whole word.
        if True in in_skip_region[lo:hi]:
            continue
        word = hit.group(1)
        # All-caps tokens are never matched (presumably acronyms).
        if word.isupper():
            continue
        # Capitalised words are only accepted at the very start of the text
        # (presumably to leave proper nouns alone).
        if lo != 0 and word[0].isupper():
            continue
        entry = _LOOKUP.get(word.lower())
        if entry is not None:
            yield lo, hi, word, entry