tests/test_jcs_canonical_unit.py

124 lines · python

"""
test_jcs_canonical_unit
=======================
 
Byte-exact fixtures for the JSON Canonicalization Scheme (RFC 8785) port.
 
Background: the Rust reference uses ``serde_jcs::to_vec`` everywhere it
canonicalizes for signing or hashing. Python was historically on
``json.dumps(sort_keys=True, separators=(",",":")).encode("utf-8")``, which is
byte-identical to JCS for the ASCII-only subset but diverges for any non-ASCII
string value, because Python's default ``ensure_ascii=True`` escapes non-ASCII
as ``\\uXXXX`` while JCS emits raw UTF-8. That divergence was a latent threat
to the "bit-identical / conformance is ground truth" claim: any manifest,
tlog leaf, or evidence bundle containing a non-ASCII character would hash and
sign to different bytes across the two implementations.
 
These tests pin the JCS algorithm itself on known vectors (so a future
refactor cannot silently regress it), prove the non-ASCII divergence is
closed (the actual bug fix), and prove no regression for the existing
ASCII-only content (so committed fixtures and existing signatures stay valid).
"""
 
from __future__ import annotations
 
import json
import os
import sys
from pathlib import Path
 
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
 
from oversight_core.jcs import jcs_dumps
 
 
def test_primitives():
    """Pin the canonical bytes for scalars and for empty containers."""
    vectors = (
        (None, b"null"),
        (True, b"true"),
        (False, b"false"),
        (0, b"0"),
        (42, b"42"),
        (-1, b"-1"),
        # 2**63 - 1: the largest signed 64-bit integer must keep every digit.
        (9223372036854775807, b"9223372036854775807"),
        ("hello", b'"hello"'),
        ("", b'""'),
        ([], b"[]"),
        ({}, b"{}"),
    )
    for value, expected in vectors:
        assert jcs_dumps(value) == expected
 
 
def test_key_sorting_nested():
    """Object keys are sorted at every depth; array element order is kept."""
    flat = {"b": 1, "a": 2}
    nested = {"z": 1, "a": {"y": 2, "x": 3}}
    sequence = [3, 1, 2]

    assert jcs_dumps(flat) == b'{"a":2,"b":1}'
    assert jcs_dumps(nested) == b'{"a":{"x":3,"y":2},"z":1}'
    # Arrays are ordered data: canonicalization must not sort them.
    assert jcs_dumps(sequence) == b"[3,1,2]"
 
 
def test_string_escapes():
    """Pin the escaped form of quote, backslash and control characters.

    The quote and the backslash are backslash-escaped, the control
    characters that have a short escape use it, and U+0001 (which has no
    short escape) is written in the six-character lowercase hex form.
    """
    vectors = (
        ('a"b', b'"a\\"b"'),
        ("a\\b", b'"a\\\\b"'),
        ("a\nb", b'"a\\nb"'),
        ("a\tb", b'"a\\tb"'),
        ("a\rb", b'"a\\rb"'),
        ("a\bb", b'"a\\bb"'),
        ("a\fb", b'"a\\fb"'),
        ("a\x01b", b'"a\\u0001b"'),
    )
    for text, expected in vectors:
        assert jcs_dumps(text) == expected
 
 
def test_non_ascii_emits_raw_utf8_not_uXXXX_escape():
    """Non-ASCII string values are written as raw UTF-8 bytes.

    The expected values are spelled out as byte literals on purpose, so the
    fixture stays byte-exact and does not depend on any encoder.
    """
    vectors = (
        # Two-byte UTF-8 sequence.
        ({"name": "café"}, b'{"name":"caf\xc3\xa9"}'),
        # Three-byte UTF-8 sequences.
        ({"k": "日本"}, b'{"k":"\xe6\x97\xa5\xe6\x9c\xac"}'),
        # Four-byte UTF-8 sequence (character outside the BMP).
        ({"k": "𝄞"}, b'{"k":"\xf0\x9d\x84\x9e"}'),
    )
    for document, expected in vectors:
        assert jcs_dumps(document) == expected
 
 
def test_non_ascii_key_sort_order():
    """A non-ASCII key sorts after the ASCII keys and is emitted as raw UTF-8.

    NOTE(review): every key in this vector lies in the Basic Multilingual
    Plane, so it cannot distinguish UTF-16 code-unit ordering (what RFC 8785
    specifies) from plain code-point ordering. Consider adding a
    supplementary-plane key once the expected order has been confirmed
    against the Rust reference.
    """
    unsorted_keys = {"ñ": 3, "z": 2, "abc": 1}
    expected = b'{"abc":1,"z":2,"\xc3\xb1":3}'
    assert jcs_dumps(unsorted_keys) == expected
 
 
def test_floats_rejected():
    """Floats raise ``TypeError`` both at top level and nested in an object."""
    float_inputs = (
        (1.0, "jcs_dumps accepted a float"),
        ({"x": 1.5}, "jcs_dumps accepted a nested float"),
    )
    for payload, complaint in float_inputs:
        try:
            jcs_dumps(payload)
        except TypeError:
            # Expected outcome: move on to the next input.
            continue
        # Reaching this line means the call returned instead of raising.
        raise AssertionError(complaint)
 
 
def test_unsupported_types_rejected():
    """Values with no JSON representation raise ``TypeError``."""
    rejected_inputs = (object(), b"bytes", set(), frozenset())
    for candidate in rejected_inputs:
        try:
            jcs_dumps(candidate)
        except TypeError:
            pass
        else:
            # The call returned normally, so the value was wrongly accepted.
            raise AssertionError(f"jcs_dumps accepted {type(candidate).__name__}")
 
 
def test_ascii_content_byte_identical_to_legacy_sort_keys():
    """ASCII-only documents canonicalize to the same bytes as the legacy path.

    The legacy encoding is ``json.dumps`` with sorted keys and compact
    separators, encoded as UTF-8; each sample must match it byte for byte.
    """
    ascii_documents = (
        {"event": "register", "file_id": "f0", "n": 3},
        {"a": ["x", "y"], "b": {"c": True, "d": None}},
        {"size": 7, "root": "00" * 32, "signature": "ab" * 64},
    )
    for document in ascii_documents:
        legacy_text = json.dumps(document, sort_keys=True, separators=(",", ":"))
        legacy = legacy_text.encode("utf-8")
        canonical = jcs_dumps(document)
        assert canonical == legacy, (
            f"ASCII divergence!\n  legacy: {legacy!r}\n  jcs:    {canonical!r}"
        )
 
 
def test_tuple_serializes_like_list():
    """A tuple is emitted as a JSON array."""
    triple = (1, 2, 3)
    assert jcs_dumps(triple) == b"[1,2,3]"
 
 
def test_round_trip_through_json_parser():
    """Canonical output decodes as UTF-8 and parses back to the input value."""
    documents = (
        {"a": 1, "b": [True, None, "x"], "c": {"d": "café"}},
        {"issuer": "Zión@test", "hash": "ab" * 16},
    )
    for original in documents:
        canonical_text = jcs_dumps(original).decode("utf-8")
        assert json.loads(canonical_text) == original

1	"""
2	test_jcs_canonical_unit
3	=======================
4
5	Byte-exact fixtures for the JSON Canonicalization Scheme (RFC 8785) port.
6
7	Background: the Rust reference uses ``serde_jcs::to_vec`` everywhere it
8	canonicalizes for signing or hashing. Python was historically on
9	``json.dumps(sort_keys=True, separators=(",",":")).encode("utf-8")``, which is
10	byte-identical to JCS for the ASCII-only subset but diverges for any non-ASCII
11	string value, because Python's default ``ensure_ascii=True`` escapes non-ASCII
12	as ``\\uXXXX`` while JCS emits raw UTF-8. That divergence was a latent threat
13	to the "bit-identical / conformance is ground truth" claim: any manifest,
14	tlog leaf, or evidence bundle containing a non-ASCII character would hash and
15	sign to different bytes across the two implementations.
16
17	These tests pin the JCS algorithm itself on known vectors (so a future
18	refactor cannot silently regress it), prove the non-ASCII divergence is
19	closed (the actual bug fix), and prove no regression for the existing
20	ASCII-only content (so committed fixtures and existing signatures stay valid).
21	"""
22
23	from __future__ import annotations
24
25	import json
26	import os
27	import sys
28	from pathlib import Path
29
30	ROOT = Path(__file__).resolve().parent.parent
31	sys.path.insert(0, str(ROOT))
32
33	from oversight_core.jcs import jcs_dumps
34
35
36	def test_primitives():
37	assert jcs_dumps(None) == b"null"
38	assert jcs_dumps(True) == b"true"
39	assert jcs_dumps(False) == b"false"
40	assert jcs_dumps(0) == b"0"
41	assert jcs_dumps(42) == b"42"
42	assert jcs_dumps(-1) == b"-1"
43	assert jcs_dumps(9223372036854775807) == b"9223372036854775807"
44	assert jcs_dumps("hello") == b'"hello"'
45	assert jcs_dumps("") == b'""'
46	assert jcs_dumps([]) == b"[]"
47	assert jcs_dumps({}) == b"{}"
48
49
50	def test_key_sorting_nested():
51	assert jcs_dumps({"b": 1, "a": 2}) == b'{"a":2,"b":1}'
52	assert jcs_dumps({"z": 1, "a": {"y": 2, "x": 3}}) == b'{"a":{"x":3,"y":2},"z":1}'
53	assert jcs_dumps([3, 1, 2]) == b"[3,1,2]"
54
55
56	def test_string_escapes():
57	assert jcs_dumps('a"b') == b'"a\\"b"'
58	assert jcs_dumps("a\\b") == b'"a\\\\b"'
59	assert jcs_dumps("a\nb") == b'"a\\nb"'
60	assert jcs_dumps("a\tb") == b'"a\\tb"'
61	assert jcs_dumps("a\rb") == b'"a\\rb"'
62	assert jcs_dumps("a\bb") == b'"a\\bb"'
63	assert jcs_dumps("a\fb") == b'"a\\fb"'
64	assert jcs_dumps("a\x01b") == b'"a\\u0001b"'
65
66
67	def test_non_ascii_emits_raw_utf8_not_uXXXX_escape():
68	assert jcs_dumps({"name": "café"}) == b'{"name":"caf\xc3\xa9"}'
69	assert jcs_dumps({"k": "日本"}) == b'{"k":"\xe6\x97\xa5\xe6\x9c\xac"}'
70	assert jcs_dumps({"k": "𝄞"}) == b'{"k":"\xf0\x9d\x84\x9e"}'
71
72
73	def test_non_ascii_key_sort_order():
74	out = jcs_dumps({"ñ": 3, "z": 2, "abc": 1})
75	assert out == b'{"abc":1,"z":2,"\xc3\xb1":3}'
76
77
78	def test_floats_rejected():
79	try:
80	jcs_dumps(1.0)
81	raise AssertionError("jcs_dumps accepted a float")
82	except TypeError:
83	pass
84	try:
85	jcs_dumps({"x": 1.5})
86	raise AssertionError("jcs_dumps accepted a nested float")
87	except TypeError:
88	pass
89
90
91	def test_unsupported_types_rejected():
92	for bad in (object(), b"bytes", set(), frozenset()):
93	try:
94	jcs_dumps(bad)
95	raise AssertionError(f"jcs_dumps accepted {type(bad).__name__}")
96	except TypeError:
97	pass
98
99
100	def test_ascii_content_byte_identical_to_legacy_sort_keys():
101	samples = [
102	{"event": "register", "file_id": "f0", "n": 3},
103	{"a": ["x", "y"], "b": {"c": True, "d": None}},
104	{"size": 7, "root": "00" * 32, "signature": "ab" * 64},
105	]
106	for s in samples:
107	legacy = json.dumps(s, sort_keys=True, separators=(",", ":")).encode("utf-8")
108	assert jcs_dumps(s) == legacy, (
109	f"ASCII divergence!\n legacy: {legacy!r}\n jcs: {jcs_dumps(s)!r}"
110	)
111
112
113	def test_tuple_serializes_like_list():
114	assert jcs_dumps((1, 2, 3)) == b"[1,2,3]"
115
116
117	def test_round_trip_through_json_parser():
118	cases = [
119	{"a": 1, "b": [True, None, "x"], "c": {"d": "café"}},
120	{"issuer": "Zión@test", "hash": "ab" * 16},
121	]
122	for c in cases:
123	rt = json.loads(jcs_dumps(c).decode("utf-8"))
124	assert rt == c