Zion Boggan
repos/Oversight/oversight_core/semantic.py
zionboggan.com ↗
684 lines · python
History for this file →
1
"""
2
oversight_core.semantic
3
======================
4
 
5
L3 semantic watermarking - the airgap-strip survivor.
6
 
7
Unlike L1 (zero-width unicode) and L2 (whitespace) which die the moment an
8
attacker runs a normalization pass, semantic marks are encoded in the *choice
9
of words* themselves. An attacker who opens the file in an airgapped VM and
10
strips invisible characters still has the watermark, because the words ARE
11
the watermark.
12
 
13
This module implements three real techniques:
14
 
15
  T1 - Synonym-class rotation
16
      For each synonym class (e.g., {begin, start, commence}), the choice made
17
      in each instance encodes bits of the mark_id. The attacker cannot tell
18
      whether "begin" or "start" was the original without access to the source,
19
      so stripping requires paraphrasing every candidate word - which damages
20
      the document and still doesn't defeat the mark if redundancy is high.
21
 
22
  T2 - Punctuation-style fingerprint
23
      Deterministic per-recipient choices of:
24
        - Oxford comma (on/off) at each list
25
        - Em dash vs en dash in parenthetical breaks
26
        - Straight vs curly quotes
27
      These survive copy-paste. They survive OCR (which usually preserves the
28
      glyph). They can be reliably extracted from any plaintext copy.
29
 
30
  T3 - Sentence-level structural marks
31
      For lists/enumerations, the ordering of items (when semantically
32
      neutral) encodes bits. For sentences, the choice of
33
      active-vs-passive voice in N eligible sentences encodes bits.
34
 
35
All three survive UTF-8 normalization, invisible-char stripping, whitespace
36
normalization, format conversion, and most OCR passes.
37
 
38
They do NOT survive aggressive manual paraphrasing by a human. That's the
39
fundamental limit of semantic watermarking: you cannot defend against
40
rewriting in someone else's words. You CAN make automated stripping
41
computationally expensive and attributable.
42
 
43
Bit capacity notes:
44
    T1: ~log2(variants_per_class) bits per insertion point, ~15-40 bits per page
45
    T2: ~3-5 bits per page (Oxford comma + dashes + quotes)
46
    T3: 1 bit per re-orderable list, 1 bit per voice-eligible sentence
47
 
48
Total realistic capacity: 30-80 bits per page of normal prose.
49
A 64-bit mark ID needs about one page of text to encode redundantly.
50
"""
51
 
52
from __future__ import annotations
53
 
54
import hashlib
55
import re
56
from typing import Optional
57
 
58
 
59
 
60
try:
61
    from .synonyms_v2 import (
62
        ALL_CLASSES as _V2_CLASSES,
63
        iter_matchable_words,
64
        SYNONYM_COUNT as _V2_COUNT,
65
    )
66
    SYNONYMS_V2_AVAILABLE = True
67
except ImportError:
68
    SYNONYMS_V2_AVAILABLE = False
69
 
70
 
71
# v1 synonym dictionary. Each tuple is one interchangeable class; the position
# of a word inside its tuple is the "variant index" that the embedder selects
# and the verifier reads back.
SYNONYM_CLASSES = [
    # verbs
    ("begin", "start", "commence"),
    ("large", "big", "substantial"),
    ("fast", "quick", "rapid"),
    ("show", "display", "present"),
    ("use", "utilize", "employ"),
    ("help", "assist", "aid"),
    ("make", "create", "produce"),
    ("get", "obtain", "acquire"),
    ("find", "locate", "identify"),
    ("tell", "inform", "notify"),
    ("give", "provide", "supply"),
    ("end", "finish", "conclude"),
    # adjectives
    ("small", "tiny", "minor"),
    ("slow", "gradual", "deliberate"),
    ("important", "critical", "significant"),
    ("hard", "difficult", "challenging"),
    ("easy", "simple", "straightforward"),
    # nouns
    ("problem", "issue", "concern"),
    ("answer", "response", "reply"),
    ("question", "query", "inquiry"),
    ("idea", "concept", "notion"),
    ("plan", "strategy", "approach"),
    ("result", "outcome", "consequence"),
    # connectives
    ("however", "nevertheless", "nonetheless"),
    ("therefore", "consequently", "thus"),
    ("also", "additionally", "furthermore"),
    ("but", "yet", "though"),
]
100
 
101
 
102
def _build_synonym_lookup() -> dict[str, tuple[int, int]]:
    """Index the v1 dictionary by word.

    Returns a mapping from each lowercased synonym in ``SYNONYM_CLASSES`` to
    ``(class_index, variant_index)``. If a word were listed twice, the later
    entry would win.
    """
    return {
        variant.lower(): (class_idx, variant_idx)
        for class_idx, group in enumerate(SYNONYM_CLASSES)
        for variant_idx, variant in enumerate(group)
    }


# Built once at import time; shared by the v1 embed/extract/verify functions.
SYNONYM_LOOKUP = _build_synonym_lookup()
112
 
113
 
114
def _bits_of(data: bytes) -> list[int]:
115
    out = []
116
    for byte in data:
117
        for i in range(8):
118
            out.append((byte >> (7 - i)) & 1)
119
    return out
120
 
121
 
122
def _bytes_from_bits(bits: list[int]) -> bytes:
123
    n = (len(bits) // 8) * 8
124
    out = bytearray()
125
    for i in range(0, n, 8):
126
        b = 0
127
        for j in range(8):
128
            b = (b << 1) | (bits[i + j] & 1)
129
        out.append(b)
130
    return bytes(out)
131
 
132
 
133
def _mark_id_to_variant_sequence(
134
    mark_id: bytes, n_instances: int, class_size: int = 3
135
) -> list[int]:
136
    """
137
    Derive a deterministic sequence of variant indices from mark_id.
138
    Uses HKDF-like expansion via SHA-256 over (mark_id || counter).
139
    Each variant index is in [0, class_size).
140
    """
141
    out: list[int] = []
142
    ctr = 0
143
    while len(out) < n_instances:
144
        h = hashlib.sha256(mark_id + ctr.to_bytes(4, "big")).digest()
145
        for byte in h:
146
            out.append(byte % class_size)
147
            if len(out) >= n_instances:
148
                break
149
        ctr += 1
150
    return out
151
 
152
 
153
def _case_preserve(replacement: str, original: str) -> str:
154
    """Match capitalization pattern: Title, UPPER, or lower."""
155
    if original.isupper():
156
        return replacement.upper()
157
    if original[:1].isupper():
158
        return replacement[:1].upper() + replacement[1:]
159
    return replacement.lower()
160
 
161
 
162
_WORD_RE = re.compile(r"\b([A-Za-z]+)\b")
163
 
164
_ZW_CHARS = "\u200b\u200c\u200d\ufeff"
165
 
166
 
167
def _strip_zw(text: str) -> str:
168
    for ch in _ZW_CHARS:
169
        text = text.replace(ch, "")
170
    return text
171
 
172
 
173
def embed_synonyms(text: str, mark_id: bytes, min_instances: int = 8) -> str:
    """
    Rewrite every v1 synonym-class word in ``text`` to the variant selected
    by the sequence derived from ``mark_id``.

    The n-th dictionary word found in the text receives the n-th value of
    ``_mark_id_to_variant_sequence``; the replacement copies the original
    word's capitalization.

    If fewer than ``min_instances`` dictionary words are present, a warning
    is printed to stderr and the text is returned untouched, so a partial
    mark is never produced.

    Zero-width characters are NOT stripped here (that would destroy an L1
    mark), so a word split by one is not recognised. Apply this layer before
    L1 for that reason.
    """
    hits: list[tuple[int, int, int, str]] = []
    for found in _WORD_RE.finditer(text):
        word = found.group(1)
        entry = SYNONYM_LOOKUP.get(word.lower())
        if entry is not None:
            hits.append((found.start(), found.end(), entry[0], word))

    if len(hits) < min_instances:
        import sys
        print(
            f"[semantic] warning: only {len(hits)} synonym-class hits "
            f"(need {min_instances}); skipping L3",
            file=sys.stderr,
        )
        return text

    picks = _mark_id_to_variant_sequence(mark_id, len(hits), class_size=3)

    pieces: list[str] = []
    tail = 0
    for (begin, finish, class_idx, word), pick in zip(hits, picks):
        group = SYNONYM_CLASSES[class_idx]
        # Copy the untouched gap, then the re-cased variant.
        pieces.append(text[tail:begin])
        pieces.append(_case_preserve(group[pick % len(group)], word))
        tail = finish
    pieces.append(text[tail:])
    return "".join(pieces)
216
 
217
 
218
def extract_synonyms_candidate(text: str, mark_len_bytes: int = 8) -> list[bytes]:
    """
    Fingerprint the synonym choices observed in ``text``.

    The original text is unknown at extraction time, so the mark_id cannot be
    read back directly. This function instead collects the
    ``(class_index, variant_index)`` pair of every v1 dictionary word, in
    document order, and returns the SHA-256 digest of that sequence's
    ``repr``. A registry can compare the digest with stored fingerprints.

    Zero-width characters are stripped first, exactly as
    ``verify_synonyms_match`` does, so the fingerprint is the same whether or
    not invisible marks are still present in the copy being examined.

    Args:
        text: Document text to fingerprint.
        mark_len_bytes: Currently unused; kept for interface compatibility.

    Returns:
        A one-element list holding the 32-byte digest, or an empty list when
        the text contains no dictionary words.
    """
    text = _strip_zw(text)
    seq = [
        SYNONYM_LOOKUP[key]
        for key in (m.group(1).lower() for m in _WORD_RE.finditer(text))
        if key in SYNONYM_LOOKUP
    ]
    if not seq:
        return []
    # repr() of the list of tuples is the fingerprint pre-image; keep its
    # shape stable or stored fingerprints stop matching.
    return [hashlib.sha256(repr(seq).encode()).digest()]
242
 
243
 
244
def verify_synonyms_match(
    text: str, candidate_mark_id: bytes, threshold: float = 0.70
) -> tuple[bool, float]:
    """
    Score how well the synonym choices in ``text`` agree with the choices
    ``candidate_mark_id`` would have produced.

    Zero-width characters are removed first, so the check behaves the same
    whether or not L1 marks are present or have been stripped.

    Returns ``(match, score)``: ``score`` is the fraction of v1 dictionary
    words whose variant equals the expected one, and ``match`` is
    ``score >= threshold``. With no dictionary words the result is
    ``(False, 0.0)``.
    """
    cleaned = _strip_zw(text)
    observed: list[tuple[int, int]] = [
        SYNONYM_LOOKUP[word]
        for word in (hit.group(1).lower() for hit in _WORD_RE.finditer(cleaned))
        if word in SYNONYM_LOOKUP
    ]
    if not observed:
        return False, 0.0

    expected = _mark_id_to_variant_sequence(candidate_mark_id, len(observed), 3)
    agreeing = sum(
        1
        for (class_idx, seen), wanted in zip(observed, expected)
        if wanted % len(SYNONYM_CLASSES[class_idx]) == seen
    )
    score = agreeing / len(observed)
    return (score >= threshold), score
279
 
280
 
281
 
282
 
283
def _bit_for(mark_id: bytes, bit_index: int) -> int:
284
    """Deterministic bit selector from mark_id."""
285
    byte = mark_id[bit_index % len(mark_id)]
286
    return (byte >> (bit_index % 8)) & 1
287
 
288
 
289
def embed_punctuation(text: str, mark_id: bytes) -> str:
    """
    Apply punctuation-style marks to text deterministically.

    Three style choices are driven by ``_bit_for(mark_id, 0..2)``:

      bit 0 - Oxford comma: 1 inserts the comma before "and"/"or" in
              "x, y and z"; 0 removes it.
      bit 1 - dashes: 1 turns " -- " and "a--b" into em dashes; 0 turns em
              dashes back into "--".
      bit 2 - double quotes: 1 turns straight quotes into curly quotes,
              alternating opening/closing in document order; 0 turns curly
              double quotes back into straight ones.

    Every bit normalises in both directions, so the output style depends
    only on ``mark_id`` and not on the style the input happened to use.

    Idempotent: running twice produces the same output.
    """
    use_oxford = _bit_for(mark_id, 0)
    use_em_dash = _bit_for(mark_id, 1)
    use_curly = _bit_for(mark_id, 2)

    EM_DASH = "\u2014"
    OPEN_Q = "\u201c"
    CLOSE_Q = "\u201d"

    # "and" and "or" are both handled because extract_punctuation_bits
    # counts both conjunctions when reading this signal back.
    if use_oxford:
        text = re.sub(r"(\w+), (\w+) (and|or) ", r"\1, \2, \3 ", text)
    else:
        text = re.sub(r"(\w+), (\w+), (and|or) ", r"\1, \2 \3 ", text)

    if use_em_dash:
        text = text.replace(" -- ", f" {EM_DASH} ")
        text = re.sub(r"(\w)--(\w)", lambda m: m.group(1) + EM_DASH + m.group(2), text)
    else:
        text = text.replace(f" {EM_DASH} ", " -- ")
        text = re.sub(r"(\w)" + EM_DASH + r"(\w)", r"\1--\2", text)

    if use_curly:
        opening = True

        def _curly(_m):
            # The first straight quote opens, the next closes, and so on.
            nonlocal opening
            glyph = OPEN_Q if opening else CLOSE_Q
            opening = not opening
            return glyph

        text = re.sub(r'"', _curly, text)
    else:
        text = text.replace(OPEN_Q, '"').replace(CLOSE_Q, '"')

    return text
323
 
324
 
325
def extract_punctuation_bits(text: str) -> list[int]:
    """
    Read the punctuation-style fingerprint out of the text.

    Three signals are examined in order: Oxford comma, dash style, quote
    style. For each signal with at least one occurrence a bit is appended:
    1 when the "marked" form (Oxford comma / em dash / curly quote) strictly
    outnumbers the plain form, otherwise 0. Signals with no occurrence add
    nothing, so the result may hold fewer than three bits and a bit's
    position in the list does not by itself identify its signal.
    """
    tallies = (
        # (marked-form count, plain-form count) per signal
        (
            len(re.findall(r",\s+\w+,\s+(?:and|or)\s+", text)),
            len(re.findall(r"\w,\s+\w+\s+(?:and|or)\s+", text)),
        ),
        (
            text.count("\u2014"),
            len(re.findall(r"\w--\w| -- ", text)),
        ),
        (
            text.count("\u201c") + text.count("\u201d"),
            text.count('"'),
        ),
    )
    return [1 if marked > plain else 0 for marked, plain in tallies if marked + plain > 0]
348
 
349
 
350
 
351
# (American, British) spelling pairs. A pair's list position is its variant
# index; American is bit value 0 and British is bit value 1.
SPELLING_VARIANTS = [
    # -or / -our
    ("color", "colour"),
    ("favor", "favour"),
    ("honor", "honour"),
    ("humor", "humour"),
    ("labor", "labour"),
    ("neighbor", "neighbour"),
    ("behavior", "behaviour"),
    # -ize / -ise
    ("organization", "organisation"),
    ("realize", "realise"),
    ("analyze", "analyse"),
    ("optimize", "optimise"),
    ("authorize", "authorise"),
    ("recognize", "recognise"),
    ("customize", "customise"),
    ("minimize", "minimise"),
    ("maximize", "maximise"),
    # -se / -ce
    ("defense", "defence"),
    ("offense", "offence"),
    ("license", "licence"),
    # miscellaneous
    ("catalog", "catalogue"),
    ("program", "programme"),
    # -er / -re
    ("center", "centre"),
    ("meter", "metre"),
    ("fiber", "fibre"),
    ("theater", "theatre"),
]

# Lowercased spelling -> (pair index, bit), where bit is 0 for the American
# form and 1 for the British form.
_SPELLING_LOOKUP: dict[str, tuple[int, int]] = {
    spelling.lower(): (pair_idx, bit)
    for pair_idx, pair in enumerate(SPELLING_VARIANTS)
    for bit, spelling in enumerate(pair)
}
383
 
384
 
385
def embed_spelling(text: str, mark_id: bytes) -> str:
    """Apply spelling variant marks keyed to mark_id bits.

    For pair ``i`` of ``SPELLING_VARIANTS``, ``_bit_for(mark_id, i + 8)``
    selects the British (1) or American (0) spelling, and every whole-word
    occurrence of the other spelling is rewritten, keeping the original
    capitalization.

    Matching is restricted to whole words. Without the boundaries, "program"
    matches inside "programme" and "meter" inside "parameter", which corrupts
    the text and makes a second pass change it again. Inflected forms such as
    "colors" are therefore left alone; ``extract_spelling_bits`` only reads
    whole dictionary words, so they never carried a readable bit.
    """
    for index, (american, british) in enumerate(SPELLING_VARIANTS):
        if _bit_for(mark_id, index + 8):
            wanted, unwanted = british, american
        else:
            wanted, unwanted = american, british
        pattern = re.compile(r"\b" + re.escape(unwanted) + r"\b", re.IGNORECASE)
        # Bind `wanted` as a default so the callback does not depend on the
        # loop variable's later values.
        text = pattern.sub(
            lambda m, wanted=wanted: _case_preserve(wanted, m.group()), text
        )
    return text
394
 
395
 
396
def extract_spelling_bits(text: str) -> list[tuple[int, int]]:
    """
    Extract spelling variant bits from text.

    Every whole word found in ``_SPELLING_LOOKUP`` (case-insensitively)
    contributes one ``(variant_index, bit_value)`` tuple, in document order;
    a word that occurs several times contributes several tuples.
    """
    return [
        _SPELLING_LOOKUP[word]
        for word in (hit.group(1).lower() for hit in _WORD_RE.finditer(text))
        if word in _SPELLING_LOOKUP
    ]
408
 
409
 
410
 
411
# (contracted, expanded) pairs. A pair's list position is its contraction
# index; the contracted form is bit value 0 and the expanded form is 1.
CONTRACTIONS = [
    # negations
    ("don't", "do not"),
    ("doesn't", "does not"),
    ("didn't", "did not"),
    ("won't", "will not"),
    ("wouldn't", "would not"),
    ("shouldn't", "should not"),
    ("couldn't", "could not"),
    ("isn't", "is not"),
    ("aren't", "are not"),
    ("wasn't", "was not"),
    ("weren't", "were not"),
    ("hasn't", "has not"),
    ("haven't", "have not"),
    ("hadn't", "had not"),
    ("can't", "cannot"),
    # "to be"
    ("it's", "it is"),
    ("that's", "that is"),
    ("there's", "there is"),
    ("they're", "they are"),
    ("we're", "we are"),
    ("you're", "you are"),
    ("I'm", "I am"),
    ("he's", "he is"),
    ("she's", "she is"),
    # "have"
    ("we've", "we have"),
    ("they've", "they have"),
    ("I've", "I have"),
    ("you've", "you have"),
    # "will"
    ("we'll", "we will"),
    ("they'll", "they will"),
]
443
 
444
 
445
def embed_contractions(text: str, mark_id: bytes) -> str:
    """
    Expand or contract eligible contractions based on mark_id bits.
    Bit 0 = contracted form, Bit 1 = expanded form.

    For pair ``i`` of ``CONTRACTIONS`` the deciding bit is
    ``_bit_for(mark_id, i + 40)``. Each occurrence of the unwanted form is
    replaced by the wanted one, keeping the original capitalization.

    Matching is restricted to whole words/phrases. Without the boundaries,
    "he is" matches inside "the island", "do not" inside "do nothing" and
    "it is" inside "it isn't", producing corrupted output such as
    "the'sland".
    """
    for index, (contracted, expanded) in enumerate(CONTRACTIONS):
        if _bit_for(mark_id, index + 40):
            unwanted, wanted = contracted, expanded
        else:
            unwanted, wanted = expanded, contracted
        pattern = re.compile(r"\b" + re.escape(unwanted) + r"\b", re.IGNORECASE)
        # Bind `wanted` as a default so the callback does not depend on the
        # loop variable's later values.
        text = pattern.sub(
            lambda m, wanted=wanted: _case_preserve(wanted, m.group()), text
        )
    return text
463
 
464
 
465
def extract_contraction_bits(text: str) -> list[tuple[int, int]]:
    """
    Detect which form (contracted vs expanded) appears in text.
    Returns list of (contraction_index, bit_value).

    A pair contributes a tuple only when exactly one of its two forms is
    present: bit 0 for the contracted form, bit 1 for the expanded form.
    Pairs with neither or both forms present are skipped as ambiguous.

    Forms are matched case-insensitively as whole words/phrases, so that
    "he is" is not detected inside "the island", nor "do not" inside
    "do nothing".
    """
    def _present(form: str) -> bool:
        pattern = r"\b" + re.escape(form) + r"\b"
        return re.search(pattern, text, re.IGNORECASE) is not None

    found = []
    for index, (contracted, expanded) in enumerate(CONTRACTIONS):
        has_contracted = _present(contracted)
        has_expanded = _present(expanded)
        if has_contracted != has_expanded:
            found.append((index, 0 if has_contracted else 1))
    return found
480
 
481
 
482
 
483
def embed_number_format(text: str, mark_id: bytes) -> str:
    """
    Apply number formatting choices keyed to mark_id.

    ``_bit_for(mark_id, 72)`` - thousands separators:
        1 rewrites bare integers of four or more digits as "1,000";
        0 rewrites comma-grouped integers as "1000".
    ``_bit_for(mark_id, 73)`` - percentages:
        1 rewrites "50%" as "50 percent";
        0 rewrites "50 percent" (any case) as "50%".

    Digit runs that directly follow a "." or "," are left alone when adding
    separators, so the fractional part of "3.14159" is not grouped. When
    removing separators, the whole grouped number is handled at once, so
    "1,000,000" becomes "1000000" rather than "1000,000".

    NOTE(review): four-digit years ("2024") are treated like any other
    integer and become "2,024" when separators are enabled.
    """
    use_separators = _bit_for(mark_id, 72)
    spell_percent = _bit_for(mark_id, 73)

    if use_separators:
        def _group_thousands(m):
            # Group from the right while keeping every digit, including any
            # leading zeros.
            digits = m.group()
            head = len(digits) % 3 or 3
            chunks = [digits[:head]]
            chunks.extend(digits[i:i + 3] for i in range(head, len(digits), 3))
            return ",".join(chunks)

        text = re.sub(r"(?<![\d.,])\b\d{4,}\b", _group_thousands, text)
    else:
        # One to three leading digits followed by one or more ",ddd" groups,
        # and not the start of a longer comma-separated digit list.
        text = re.sub(
            r"(?<![\d.,])\b\d{1,3}(?:,\d{3})+\b(?!,\d)",
            lambda m: m.group().replace(",", ""),
            text,
        )

    if spell_percent:
        text = re.sub(r"(\d+)\s*%", r"\1 percent", text)
    else:
        text = re.sub(r"(\d+)\s+percent\b", r"\1%", text, flags=re.IGNORECASE)

    return text
513
 
514
 
515
 
516
def embed_synonyms_v2(text: str, mark_id: bytes, min_instances: int = 8) -> str:
    """
    Synonym embedding backed by the v2 dictionary.

    Candidate words come from ``iter_matchable_words``; when the v2
    dictionary could not be imported this delegates to ``embed_synonyms``.
    The n-th candidate receives the n-th value of the mark_id-derived
    sequence. Variants containing a space are never inserted: the choice is
    advanced to the next variant up to two times, and if that still lands on
    a multi-word variant the original word is kept.

    With fewer than ``min_instances`` candidates a notice is printed to
    stderr and the text is returned unchanged.
    """
    if not SYNONYMS_V2_AVAILABLE:
        return embed_synonyms(text, mark_id, min_instances)

    candidates = list(iter_matchable_words(text))
    if len(candidates) < min_instances:
        import sys
        print(
            f"[semantic v2] only {len(candidates)} matchable words "
            f"(need {min_instances}); skipping L3",
            file=sys.stderr,
        )
        return text

    picks = _mark_id_to_variant_sequence(mark_id, len(candidates), class_size=3)

    pieces: list[str] = []
    tail = 0
    for (begin, finish, word, (class_idx, _seen_vi, _pos)), pick in zip(candidates, picks):
        options = _V2_CLASSES[class_idx].variants
        choice = pick % len(options)
        # Step past multi-word variants, at most twice.
        for _ in range(2):
            if " " not in options[choice]:
                break
            choice = (choice + 1) % len(options)
        if " " in options[choice]:
            # No single-word variant reached: keep the source word as-is.
            pieces.append(text[tail:finish])
        else:
            pieces.append(text[tail:begin])
            pieces.append(_case_preserve(options[choice], word))
        tail = finish
    pieces.append(text[tail:])
    return "".join(pieces)
555
 
556
 
557
def verify_synonyms_v2(
    text: str, candidate_mark_id: bytes, threshold: float = 0.70
) -> tuple[bool, float]:
    """
    v2 verify: scores ``text`` against ``candidate_mark_id`` using the words
    reported by ``iter_matchable_words``. Falls back to
    ``verify_synonyms_match`` when the v2 dictionary is unavailable.

    The expected variant for each word is derived exactly the way
    ``embed_synonyms_v2`` chooses it: the sequence value modulo the class
    size, advanced past multi-word variants up to two times. Only when no
    single-word variant is reached (the embedder keeps the source word in
    that case, so nothing can be checked) is the position counted as a
    match. Counting every multi-word expectation as a match without
    replaying the shift would award points to text that was never marked.

    Returns (match, score); ``(False, 0.0)`` when no words are matchable.
    """
    if not SYNONYMS_V2_AVAILABLE:
        return verify_synonyms_match(text, candidate_mark_id, threshold)

    text = _strip_zw(text)
    actual = [(ci, vi) for (_s, _e, _w, (ci, vi, _pos)) in iter_matchable_words(text)]
    if not actual:
        return False, 0.0

    expected_variants = _mark_id_to_variant_sequence(candidate_mark_id, len(actual), 3)
    matches = 0
    for (ci, actual_vi), expected_vi in zip(actual, expected_variants):
        cls_variants = _V2_CLASSES[ci].variants
        exp_idx = expected_vi % len(cls_variants)
        # Replay the embedder's skip over multi-word variants.
        for _ in range(2):
            if " " not in cls_variants[exp_idx]:
                break
            exp_idx = (exp_idx + 1) % len(cls_variants)
        if " " in cls_variants[exp_idx] or exp_idx == actual_vi:
            matches += 1

    score = matches / len(actual)
    return (score >= threshold), score
587
 
588
 
589
def apply_semantic(text: str, mark_id: bytes, use_v2: bool = True) -> str:
    """
    Run the complete L3 embedding pipeline over ``text``.

    Stages run in a fixed order: synonyms (v2 when requested and available,
    otherwise v1), punctuation, spelling, contractions, number formatting.
    Each stage receives the previous stage's output and the same ``mark_id``.
    """
    if use_v2 and SYNONYMS_V2_AVAILABLE:
        synonym_stage = embed_synonyms_v2
    else:
        synonym_stage = embed_synonyms

    marked = synonym_stage(text, mark_id)
    for stage in (
        embed_punctuation,
        embed_spelling,
        embed_contractions,
        embed_number_format,
    ):
        marked = stage(marked, mark_id)
    return marked
605
 
606
 
607
def _punctuation_bits_by_slot(text: str) -> dict[int, int]:
    """Return ``{slot: bit}`` for each punctuation signal present in ``text``.

    Slots are 0 (Oxford comma), 1 (dash style) and 2 (quote style). A slot is
    included only when the text has at least one occurrence of either form;
    its bit is 1 when the marked form strictly outnumbers the plain form.
    """
    tallies = {
        0: (
            len(re.findall(r",\s+\w+,\s+(?:and|or)\s+", text)),
            len(re.findall(r"\w,\s+\w+\s+(?:and|or)\s+", text)),
        ),
        1: (
            text.count("\u2014"),
            len(re.findall(r"\w--\w| -- ", text)),
        ),
        2: (
            text.count("\u201c") + text.count("\u201d"),
            text.count('"'),
        ),
    }
    return {
        slot: 1 if marked > plain else 0
        for slot, (marked, plain) in tallies.items()
        if marked + plain > 0
    }


def verify_semantic(text: str, candidate_mark_id: bytes, use_v2: bool = True) -> dict:
    """
    Check whether text matches candidate_mark_id across all semantic sublayers.
    Returns per-sublayer scores and an overall match verdict.

    Sublayers and the mark bits they are compared against:
      synonyms     - verify_synonyms_v2 / verify_synonyms_match
      punctuation  - _bit_for(mark_id, slot) for slot 0..2
      spelling     - _bit_for(mark_id, variant_index + 8)
      contractions - _bit_for(mark_id, contraction_index + 40)

    Each punctuation signal is compared with the bit of its own slot. A
    signal that is absent from the text is skipped instead of shifting the
    remaining signals onto the wrong bits.

    Zero-width characters are stripped up front so that every sublayer, not
    only the synonym one, reads the same words whether or not invisible
    marks are still present.

    The weighted score combines the synonym score (always included) with
    each other sublayer that produced at least one observation, renormalised
    by the weights actually used. ``overall_match`` is
    ``weighted_score >= 0.65``.
    """
    text = _strip_zw(text)
    v2_active = bool(use_v2 and SYNONYMS_V2_AVAILABLE)

    if v2_active:
        syn_match, syn_score = verify_synonyms_v2(text, candidate_mark_id)
    else:
        syn_match, syn_score = verify_synonyms_match(text, candidate_mark_id)

    punct_found = _punctuation_bits_by_slot(text)
    punct_total = len(punct_found)
    punct_hits = sum(
        1
        for slot, actual_bit in punct_found.items()
        if actual_bit == _bit_for(candidate_mark_id, slot)
    )
    punct_score = punct_hits / punct_total if punct_total else 0.0

    spelling_bits = extract_spelling_bits(text)
    spelling_total = len(spelling_bits)
    spelling_hits = sum(
        1
        for si, actual_bit in spelling_bits
        if actual_bit == _bit_for(candidate_mark_id, si + 8)
    )
    spelling_score = spelling_hits / spelling_total if spelling_total else 0.0

    contraction_bits = extract_contraction_bits(text)
    contraction_total = len(contraction_bits)
    contraction_hits = sum(
        1
        for ci, actual_bit in contraction_bits
        if actual_bit == _bit_for(candidate_mark_id, ci + 40)
    )
    contraction_score = (
        contraction_hits / contraction_total if contraction_total else 0.0
    )

    weights = {"syn": 0.50, "punct": 0.10, "spell": 0.20, "contract": 0.20}
    scores = {
        "syn": syn_score,
        "punct": punct_score,
        "spell": spelling_score,
        "contract": contraction_score,
    }
    observations = {
        "punct": punct_total,
        "spell": spelling_total,
        "contract": contraction_total,
    }
    # The synonym layer always participates, so the active weight is never
    # zero and the division below is safe.
    active = [k for k in weights if k == "syn" or observations[k]]
    active_weight = sum(weights[k] for k in active)
    weighted_score = sum(scores[k] * weights[k] for k in active) / active_weight

    overall_match = weighted_score >= 0.65

    return {
        "synonyms_match": syn_match,
        "synonyms_score": syn_score,
        "punctuation_score": punct_score,
        "punctuation_hits": f"{punct_hits}/{punct_total}",
        "spelling_score": spelling_score,
        "spelling_hits": f"{spelling_hits}/{spelling_total}",
        "contraction_score": contraction_score,
        "contraction_hits": f"{contraction_hits}/{contraction_total}",
        "weighted_score": weighted_score,
        "overall_match": overall_match,
        "dict_version": "v2" if v2_active else "v1",
    }