| 1 | """ |
| 2 | oversight_core.synonyms_v2 |
| 3 | ========================= |
| 4 | |
| 5 | Expanded synonym table for L3 semantic watermarking, with part-of-speech |
| 6 | tagging and URL/code-block skip logic. |
| 7 | |
| 8 | v0.2.1 additions over the 27-class v1 list: |
| 9 | - ~150 classes (verbs, adjectives, adverbs, nouns, connectors) |
| 10 | - Part-of-speech tagging via a simple word-level heuristic (no spaCy dep) |
| 11 | - Skips matches inside URLs, file paths, email addresses, code spans |
  - Match rules: class entries are grouped by POS; a word listed in more
    than one class (e.g. "change" as both verb and noun) maps to the first
    class that lists it
| 14 | |
| 15 | Bit capacity at typical prose density (one match per ~10 words): |
| 16 | v1 (27 classes): ~40-70 bits per page |
| 17 | v2 (~150 classes): ~120-180 bits per page |
| 18 | This is enough to redundantly encode a 64-bit mark id multiple times per page. |
| 19 | |
| 20 | For cryptographer-grade rigor: keep the class table in a separate versioned |
| 21 | file (`synonyms_v2.py` here) and tag each manifest with the table version |
| 22 | used, so attribution reliably replays the exact variant space. |
| 23 | """ |
| 24 | |
| 25 | from __future__ import annotations |
| 26 | |
| 27 | import re |
| 28 | from typing import Iterator, NamedTuple |
| 29 | |
| 30 | |
class SynonymClass(NamedTuple):
    """A group of interchangeable words that share one part of speech.

    The position of a word inside ``variants`` is the variant index that the
    module's word index stores for it. ``pos`` is the coarse tag used by the
    tables in this module: "verb", "adj", "adv", "noun" or "conj".
    """

    # Interchangeable surface forms, in a fixed order.
    variants: tuple[str, ...]
    # Part-of-speech label shared by every variant in the class.
    pos: str
| 34 | |
| 35 | |
| 36 | |
# Verb classes, in table order (order determines the class index).
#
# Some words repeat a word from an earlier class: need, include, create,
# eliminate, show, build, run, examine. The word index built further down
# keeps only the first class that lists a word, so those repeats resolve to
# the earlier class. The phrases "give back" and "set up" contain a space
# and are left out of that index entirely.
VERBS: list[SynonymClass] = [
    SynonymClass(forms, "verb")
    for forms in (
        ("begin", "start", "commence"),
        ("end", "finish", "conclude"),
        ("use", "utilize", "employ"),
        ("make", "create", "produce"),
        ("get", "obtain", "acquire"),
        ("find", "locate", "identify"),
        ("show", "display", "present"),
        ("tell", "inform", "notify"),
        ("give", "provide", "supply"),
        ("help", "assist", "aid"),
        ("think", "believe", "consider"),
        ("know", "understand", "recognize"),
        ("see", "observe", "notice"),
        ("want", "desire", "need"),
        ("look", "appear", "seem"),
        ("ask", "request", "query"),
        ("send", "transmit", "deliver"),
        ("allow", "permit", "enable"),
        ("stop", "halt", "cease"),
        ("continue", "proceed", "persist"),
        ("try", "attempt", "endeavor"),
        ("change", "modify", "alter"),
        ("add", "append", "include"),
        ("remove", "delete", "eliminate"),
        ("check", "verify", "confirm"),
        ("review", "examine", "evaluate"),
        ("agree", "concur", "consent"),
        ("decide", "determine", "resolve"),
        ("require", "need", "demand"),
        ("contain", "include", "hold"),
        ("return", "yield", "give back"),
        ("create", "generate", "build"),
        ("destroy", "eliminate", "eradicate"),
        ("improve", "enhance", "upgrade"),
        ("protect", "safeguard", "defend"),
        ("discuss", "address", "cover"),
        ("explain", "clarify", "describe"),
        ("propose", "suggest", "recommend"),
        ("demonstrate", "show", "prove"),
        ("achieve", "accomplish", "attain"),
        ("manage", "handle", "administer"),
        ("develop", "build", "engineer"),
        ("establish", "set up", "institute"),
        ("support", "back", "endorse"),
        ("reject", "refuse", "decline"),
        ("reduce", "decrease", "lower"),
        ("increase", "raise", "boost"),
        ("operate", "run", "function"),
        ("execute", "perform", "run"),
        ("investigate", "examine", "research"),
    )
]
| 89 | |
# Adjective classes, in table order.
#
# Several words appear in two classes here: significant, typical, unusual,
# limited, open, restricted, complete. The word index built further down
# keeps the first class that lists a word, so the later listing is never the
# one a lookup returns.
ADJECTIVES: list[SynonymClass] = [
    SynonymClass(forms, "adj")
    for forms in (
        ("big", "large", "substantial"),
        ("small", "tiny", "minor"),
        ("fast", "quick", "rapid"),
        ("slow", "gradual", "deliberate"),
        ("important", "critical", "significant"),
        ("hard", "difficult", "challenging"),
        ("easy", "simple", "straightforward"),
        ("good", "excellent", "effective"),
        ("bad", "poor", "inferior"),
        ("new", "recent", "current"),
        ("old", "prior", "previous"),
        ("common", "typical", "standard"),
        ("rare", "unusual", "uncommon"),
        ("safe", "secure", "protected"),
        ("dangerous", "risky", "hazardous"),
        ("correct", "accurate", "right"),
        ("wrong", "incorrect", "mistaken"),
        ("clear", "obvious", "evident"),
        ("unclear", "vague", "ambiguous"),
        ("strong", "robust", "powerful"),
        ("weak", "fragile", "limited"),
        ("full", "complete", "entire"),
        ("empty", "vacant", "bare"),
        ("open", "available", "accessible"),
        ("closed", "sealed", "restricted"),
        ("visible", "apparent", "observable"),
        ("hidden", "concealed", "obscured"),
        ("public", "open", "unrestricted"),
        ("private", "confidential", "restricted"),
        ("complete", "finished", "done"),
        ("partial", "incomplete", "limited"),
        ("useful", "helpful", "valuable"),
        ("useless", "pointless", "ineffective"),
        ("interesting", "engaging", "compelling"),
        ("boring", "dull", "tedious"),
        ("early", "initial", "preliminary"),
        ("late", "delayed", "overdue"),
        ("possible", "feasible", "viable"),
        ("impossible", "unfeasible", "impractical"),
        ("normal", "typical", "regular"),
        ("abnormal", "unusual", "atypical"),
        ("high", "elevated", "significant"),
        ("low", "reduced", "minimal"),
    )
]
| 135 | |
# Adverb classes, in table order.
#
# "not ever" and "at no time" contain spaces, so the word index built
# further down leaves them out; only "never" from that class is indexed.
ADVERBS: list[SynonymClass] = [
    SynonymClass(forms, "adv")
    for forms in (
        ("quickly", "rapidly", "swiftly"),
        ("slowly", "gradually", "steadily"),
        ("carefully", "cautiously", "thoroughly"),
        ("often", "frequently", "regularly"),
        ("rarely", "seldom", "infrequently"),
        ("usually", "typically", "generally"),
        ("sometimes", "occasionally", "periodically"),
        ("always", "consistently", "invariably"),
        ("never", "not ever", "at no time"),
        ("clearly", "obviously", "plainly"),
        ("exactly", "precisely", "specifically"),
        ("approximately", "roughly", "around"),
        ("completely", "entirely", "fully"),
        ("partially", "partly", "somewhat"),
        ("immediately", "instantly", "promptly"),
        ("eventually", "ultimately", "finally"),
        ("recently", "lately", "newly"),
        ("currently", "presently", "now"),
        ("previously", "formerly", "earlier"),
        ("easily", "readily", "effortlessly"),
    )
]
| 158 | |
# Noun classes, in table order.
#
# change, query, function, notice, need and request are also listed in the
# verb table, and "approach" is listed in two noun classes. The word index
# built further down keeps the first class that lists a word, so these
# resolve to the earlier listing. "write-up" is indexed, but the word
# scanner in this module matches runs of ASCII letters only, so the
# hyphenated form is never produced as a match.
NOUNS: list[SynonymClass] = [
    SynonymClass(forms, "noun")
    for forms in (
        ("problem", "issue", "concern"),
        ("answer", "response", "reply"),
        ("question", "query", "inquiry"),
        ("idea", "concept", "notion"),
        ("plan", "strategy", "approach"),
        ("result", "outcome", "consequence"),
        ("method", "approach", "technique"),
        ("goal", "objective", "aim"),
        ("change", "modification", "alteration"),
        ("system", "framework", "structure"),
        ("process", "procedure", "workflow"),
        ("feature", "function", "capability"),
        ("effect", "impact", "influence"),
        ("cause", "reason", "source"),
        ("example", "instance", "case"),
        ("detail", "particular", "specific"),
        ("summary", "overview", "synopsis"),
        ("notice", "notification", "alert"),
        ("record", "log", "entry"),
        ("report", "document", "write-up"),
        ("data", "information", "content"),
        ("value", "amount", "quantity"),
        ("location", "place", "site"),
        ("time", "moment", "instant"),
        ("benefit", "advantage", "gain"),
        ("risk", "hazard", "threat"),
        ("error", "mistake", "flaw"),
        ("need", "requirement", "necessity"),
        ("request", "application", "petition"),
        ("opportunity", "chance", "possibility"),
    )
]
| 191 | |
# Connector classes (tagged "conj"), in table order.
CONNECTORS: list[SynonymClass] = [
    SynonymClass(forms, "conj")
    for forms in (
        ("however", "nevertheless", "nonetheless"),
        ("therefore", "consequently", "thus"),
        ("also", "additionally", "furthermore"),
        ("but", "yet", "though"),
        ("because", "since", "as"),
        ("although", "while", "whereas"),
        ("similarly", "likewise", "comparably"),
        ("instead", "rather", "alternatively"),
    )
]
| 202 | |
| 203 | |
# Every class in one list; a class's position here is its class index.
ALL_CLASSES: list[SynonymClass] = VERBS + ADJECTIVES + ADVERBS + NOUNS + CONNECTORS


def _build_lookup(classes: list[SynonymClass]) -> dict[str, tuple[int, int, str]]:
    """Index single-word variants by their lowercase form.

    Args:
        classes: Synonym classes in table order.

    Returns:
        A dict mapping each lowercase variant to
        ``(class_index, variant_index, pos)``. Variants containing a space
        are left out. When a word is listed by more than one class, the
        first class in ``classes`` that lists it wins.
    """
    index: dict[str, tuple[int, int, str]] = {}
    for class_index, entry in enumerate(classes):
        for variant_index, form in enumerate(entry.variants):
            if " " in form:
                continue
            # setdefault keeps an existing entry: first occurrence wins.
            index.setdefault(form.lower(), (class_index, variant_index, entry.pos))
    return index


# Built inside a function so the loop variables do not become module
# attributes (and are not exported by ``from ... import *``).
_LOOKUP: dict[str, tuple[int, int, str]] = _build_lookup(ALL_CLASSES)


# Number of synonym classes (not the number of indexed words).
SYNONYM_COUNT = len(ALL_CLASSES)
| 215 | |
| 216 | |
| 217 | |
# Regions of text in which words must not be matched. A word is skipped if
# any of its characters falls inside a match of any pattern below.
_SKIP_PATTERNS = [
    # http:// and https:// URLs, up to the next whitespace.
    re.compile(r"https?://\S+"),
    # Email addresses.
    re.compile(r"\b[\w.+-]+@[\w.-]+\.\w+\b"),
    # Inline code spans delimited by single backticks.
    re.compile(r"`[^`]+`"),
    # Fenced code blocks delimited by triple backticks (may span lines).
    re.compile(r"```[\s\S]*?```"),
    # POSIX-style paths starting with "/", "~/", "./" or "../" at the start
    # of the text or after whitespace.
    re.compile(r"(?:^|\s)(?:/|~/|\.{1,2}/)[^\s]+"),
    # Windows drive paths such as C:\dir\file. In a raw string "\\" is one
    # literal backslash; the rest of the path (including any further
    # backslashes, so the doubled form C:\\dir too) is taken by [^\s]+.
    re.compile(r"\b[A-Za-z]:\\[^\s]+"),
    # Runs of 16 or more hex digits.
    re.compile(r"\b[A-Fa-f0-9]{16,}\b"),
    # Runs of 32 or more base64-alphabet characters, with optional padding.
    re.compile(r"\b[A-Za-z0-9+/]{32,}={0,2}\b"),
]
| 228 | |
| 229 | |
# Candidate words: maximal runs of ASCII letters bounded by word boundaries.
# Compiled once at import instead of on every call.
_WORD_RE = re.compile(r"\b([A-Za-z]+)\b")


def iter_matchable_words(text: str) -> Iterator[tuple[int, int, str, tuple[int, int, str]]]:
    """
    Walk text and yield (start, end, word, (class_index, variant_index, pos))
    for each word that's in the synonym table AND not inside a skip region.

    This is the production entry point for L3 embedding and verification.

    A word is passed over when:
    - any of its characters lies inside a match of a ``_SKIP_PATTERNS`` entry;
    - it is entirely uppercase;
    - it starts with an uppercase letter and does not begin at offset 0;
    - its lowercase form is not a key of ``_LOOKUP``.

    ``word`` is yielded with its original casing; ``start``/``end`` are
    offsets into ``text`` (end exclusive).
    """
    # One flag byte per character; nonzero means "inside a skip region".
    skip_mask = bytearray(len(text))
    for pat in _SKIP_PATTERNS:
        for m in pat.finditer(text):
            start, end = m.span()
            skip_mask[start:end] = b"\x01" * (end - start)

    for m in _WORD_RE.finditer(text):
        start, end = m.span()
        if any(skip_mask[start:end]):
            continue
        word = m.group(1)
        # All-caps words and capitalized words away from the very start of
        # the text are never matched.
        if word.isupper() or (word[0].isupper() and start != 0):
            continue
        entry = _LOOKUP.get(word.lower())
        if entry is not None:
            yield start, end, word, entry