81d5699 · TreeTrace

Labeling accuracy: derived confidence, causal ordering, multi-label, FP guards, fail-closed redaction

Implements the approved labeling-accuracy proposal (P1-P7, embeddings deferred).

P7: secret-assignment redaction now fails closed on escaped JSON string values
whose escape-inflated length falls under the generic floor (e.g. {"api_key":"a\nz"});
adds a companion rule and end-to-end + unit fail-closed tests.

P1: security confidence/tier are derived from independent corroborating signals
(scoreSecurity) instead of constant 0.95/0.84 buckets; every contributing signal
is listed in the evidence text. A single strong signal still anchors at verified/0.95.

P2: afterFailure falls back to ingestion ordinal (node id order) instead of returning
true when timestamps are missing, so a corrector can never precede its failure.
Resolution lookup returns an honest null unless a later node shares evidence with the
failure or is an explicit acceptance turn, instead of guessing the temporally-nearest node.

P3: securityActions and inferSignals return all matching kinds (capped) instead of
first-match-wins, so multi-class events surface every label.

P4: weak keywords (bare rbac/access-control) require a co-signal to tier above inferred;
adds a checked-in negative corpus as a release gate (zero security/failure/redaction FPs).

P5: extensionless dot-directory path detection was already present; covered by an
existing test, no change needed.

P6: human security-correction backstop emits an inferred-only signal anchored to a
prior unlabeled action, never fabricating strong/verified labels.

Hard rules preserved: zero runtime deps, no LLM judge, no network/telemetry,
labels remain auditable (evidence + node ids).

81d5699 Zion Boggan committed on Jun 15, 2026 (1 week ago)

src/analyze.js +224 -39

		@@ -50,6 +50,7 @@ const STOPWORDS = new Set([
		'when', 'where', 'which', 'will', 'about', 'agent', 'make', 'made', 'show', 'look',
		]);

	+	const PROCESS_LABEL_CAP = 2;
		const CONSTRAINT_PER_NODE_CAP = 3;
		const CONSTRAINT_LIST_CAP = 10;
		const CONSTRAINT_CLAUSE_MAX = 160;
		@@ -106,6 +107,15 @@ const TEST_SKIP_API_RE =
		const TEST_SKIP_RE =
		/\b(?:disabl\|skip\|remov\|delet\|comment(?:ed)? out\|drop\|turn(?:ed)? off\|x?(?:it\|describe)\.skip\|--no-tests?\|--skip-tests?)\w\b[^.\n]{0,24}\btests?\b\|\btests?\b[^.\n]{0,24}\b(?:disabl\|skip\|remov\|delet\|comment(?:ed)? out\|turn(?:ed)? off)\w/i;

	+	// P6: strong human security-correction phrasing. Used as a corroborating co-signal and as
	+	// the inferred-tier recall backstop (must never mint a strong/verified label by itself).
	+	const SECURITY_CORRECTION_RE =
	+	/\b(?:don'?t\|do not\|never)\b[^.]{0,30}\b(?:leak\|expose\|commit\|hardcode\|hard[- ]?code\|push\|publish)\b[^.]{0,30}\b(?:secret\|secrets\|token\|tokens\|key\|keys\|credential\|credentials\|password\|passwords\|env\|api)\b\|\b(?:rotate\|revoke\|regenerate\|invalidate)\b[^.]{0,25}\b(?:that\|the\|this\|those\|your\|my)?\s(?:secret\|token\|key\|credential\|password\|pat\|api[- ]?key\|access token)\b\|\bthat'?s? (?:a\|the\|my\|our) (?:secret\|credential\|api[- ]?key\|token\|password)\b\|\b(?:revert\|undo\|roll ?back)\b[^.]{0,25}\b(?:the\|that\|those)?\s(?:auth\|security\|permission\|access[- ]?control\|rbac\|credential)\b\|\b(?:you\|it)\b[^.]{0,20}\b(?:leaked\|exposed\|hardcoded\|hard[- ]?coded\|committed)\b[^.]{0,25}\b(?:secret\|token\|key\|credential\|password\|env)\b/i;
	+
	+	function hasSecurityCorrection(text) {
	+	return typeof text === 'string' && text.length <= 4000 && SECURITY_CORRECTION_RE.test(text);
	+	}
	+
		export function classifySecuritySurface(file) {
		if (!file) return null;
		for (const rule of SECURITY_SURFACE_RULES) {
		@@ -126,31 +136,87 @@ export function mentionsTestSkip(text) {
		);
		}

	+	// P3: return ALL matching kinds per action instead of first-match-wins, so a node that
	+	// is both a credential leak and a risky command (etc.) surfaces every class. Each kind
	+	// carries its own strong/weak flag and the action that triggered it (for the audit trail).
	+	// `weak` marks a lone keyword (bare rbac/access-control) that needs a co-signal (P4).
		function securityActions(node) {
		const out = [];
		for (const a of node.actions \|\| []) {
		const body = `${a.command \|\| ''} ${a.input \|\| ''}`;
	-	let kind = null;
	-	let strong = false;
	-	if (SECRET_CONTENT_RE.test(body)) {
	-	kind = 'credential';
	-	strong = true;
	-	} else if (a.file && isCredentialFile(a.file)) {
	-	kind = 'file';
	-	strong = true;
	-	} else if (ACCESS_CONTROL_CONTENT_RE.test(body)) {
	-	kind = 'access-control';
	-	strong = true;
	-	} else if (a.command && RISKY_CMD_RE.test(a.command)) {
	-	kind = 'risky-command';
	-	} else if (ACCESS_CONTROL_WEAK_RE.test(body)) {
	-	kind = 'access-control';
	+	const kinds = [];
	+	if (SECRET_CONTENT_RE.test(body)) kinds.push({ kind: 'credential', strong: true });
	+	if (a.file && isCredentialFile(a.file)) kinds.push({ kind: 'file', strong: true });
	+	if (ACCESS_CONTROL_CONTENT_RE.test(body)) kinds.push({ kind: 'access-control', strong: true });
	+	if (a.command && RISKY_CMD_RE.test(a.command)) kinds.push({ kind: 'risky-command', strong: false });
	+	// Weak keyword: only counts when no strong access-control content already fired on this action.
	+	if (ACCESS_CONTROL_WEAK_RE.test(body) && !kinds.some((k) => k.kind === 'access-control')) {
	+	kinds.push({ kind: 'access-control', strong: false, weak: true });
		}
	-	if (kind) out.push({ action: a, kind, strong });
	+	for (const k of kinds) out.push({ action: a, ...k });
		}
		return out;
		}

	+	// Anchor (base) confidences kept stable so existing tiers/numbers do not regress:
	+	// any strong signal -> verified / 0.95 (unchanged anchor the suite asserts on)
	+	// weak keyword + co-signal, or risky command alone -> high / 0.84 base (ceiling 0.90)
	+	// lone weak keyword / inferred backstops -> 0.62 base (ceiling 0.70)
	+	const SECURITY_STRONG_BASE = 0.95;
	+	const SECURITY_WEAK_BASE = 0.84;
	+
	+	// P1: derive a security signal's confidence and tier from how many INDEPENDENT signals
	+	// corroborate it, instead of a constant two-bucket value. Each contributing signal is
	+	// listed in the evidence text (with node ids upstream) so the verdict stays auditable.
	+	// P4: a lone weak keyword (bare rbac/access-control) scores low and lands `inferred`
	+	// unless a real co-signal (security surface file, human security correction, or a risky
	+	// command) is present. Any strong signal (e.g. credential content) tiers `verified` outright.
	+	function scoreSecurity({ secActs, surface, humanCorrection }) {
	+	const signals = [];
	+	const strongActs = secActs.filter((s) => s.strong);
	+	const weakActs = secActs.filter((s) => !s.strong);
	+	const hasStrong = strongActs.length > 0;
	+	const hasWeakKeywordOnly = !hasStrong && secActs.some((s) => s.weak);
	+
	+	if (strongActs.some((s) => s.kind === 'credential')) signals.push('strong credential content');
	+	if (strongActs.some((s) => s.kind === 'file')) signals.push('credential filename');
	+	if (strongActs.some((s) => s.kind === 'access-control')) signals.push('access-control command');
	+	if (weakActs.some((s) => s.kind === 'risky-command')) signals.push('risky command');
	+	if (weakActs.some((s) => s.weak)) signals.push('access-control keyword');
	+	if (surface) signals.push(`security surface (${surface})`);
	+	if (humanCorrection) signals.push('human security correction');
	+
	+	// Independent corroboration count beyond the primary signal nudges confidence within band.
	+	const corroboration = Math.max(0, signals.length - 1);
	+
	+	let tier;
	+	let base;
	+	if (hasStrong) {
	+	tier = 'verified';
	+	base = SECURITY_STRONG_BASE;
	+	} else if (hasWeakKeywordOnly) {
	+	// P4 co-signal gate: a bare keyword with a real co-signal earns `high`; alone it stays `inferred`.
	+	const cosignal = Boolean(surface) \|\| humanCorrection \|\| weakActs.some((s) => s.kind === 'risky-command');
	+	if (cosignal) {
	+	tier = 'high';
	+	base = SECURITY_WEAK_BASE;
	+	} else {
	+	tier = 'inferred';
	+	base = 0.62;
	+	}
	+	} else {
	+	// risky-command (no keyword) or surface-only corroboration
	+	tier = 'high';
	+	base = SECURITY_WEAK_BASE;
	+	}
	+
	+	// Within-band lift from extra corroboration, clamped to the band ceiling so the
	+	// verified anchor (0.95) and existing assertions never move.
	+	const ceiling = tier === 'verified' ? 0.95 : tier === 'high' ? 0.9 : 0.7;
	+	const confidence = Math.min(ceiling, Math.round((base + 0.02 * corroboration) * 100) / 100);
	+	return { tier, confidence, signals };
	+	}
	+
		function fileHint(node) {
		for (const a of node.actions \|\| []) {
		if (a.file) return a.file;
		@@ -306,14 +372,18 @@ export function analyzeTree(tree) {
		return failure;
		};

	+	const securityNodeIds = new Set();
		tree.nodes.forEach((node, index) => {
		const secActs = securityActions(node);
		if (secActs.length) {
	-	const hasStrong = secActs.some((s) => s.strong);
	-	const tier = hasStrong ? 'verified' : 'high';
	-	const confidence = hasStrong ? 0.95 : 0.84;
	+	// P1: corroborating co-signals -- surface class on a touched file, and a human
	+	// security correction that points back at this node -- feed the derived score.
	+	const surface = uniq((node.actions \|\| []).map((a) => classifySecuritySurface(a.file))).filter(Boolean)[0] \|\| null;
	+	const humanCorrection =
	+	node.kind !== 'correction' ? Boolean(nearestSecurityCorrection(tree.nodes, node)) : false;
	+	const { tier, confidence, signals } = scoreSecurity({ secActs, surface, humanCorrection });
		const targets = uniq(secActs.map((s) => s.action.file \|\| s.action.command \|\| s.action.input)).slice(0, 3);
	-	const kinds = uniq(secActs.map((s) => s.kind));
	+	const kinds = uniq(secActs.map((s) => s.kind)); // P3: every matching class, not first-match-wins
		addFailure({
		type: 'security_or_privacy_risk',
		confidence,
		@@ -321,9 +391,10 @@ export function analyzeTree(tree) {
		failureNode: node,
		correctionNode: node.kind === 'correction' ? null : nearestCorrectionAfter(tree.nodes, node),
		resolvedNode: nearestAcceptedAfter(tree.nodes, node, null),
	-	evidence: `Agent action touched ${kinds.join(', ')}: ${targets.map((t) => `"${truncate(String(t), 80)}"`).join(', ')}`,
	+	evidence: `Agent action touched ${kinds.join(', ')} [signals: ${signals.join('; ')}]: ${targets.map((t) => `"${truncate(String(t), 80)}"`).join(', ')}`,
		summary: `An agent action touched auth, secrets, or access control near "${truncate(node.title, 90)}".`,
		});
	+	securityNodeIds.add(node.id);
		} else if (node.text.length <= 1200 && SECURITY_INTENT_RE.test(node.text)) {
		addFailure({
		type: 'security_or_privacy_risk',
		@@ -335,6 +406,30 @@ export function analyzeTree(tree) {
		evidence: `User stated a security-sensitive intent: "${quote(node.text)}"`,
		summary: `A security-sensitive intent was stated near "${truncate(node.title, 90)}".`,
		});
	+	securityNodeIds.add(node.id);
	+	}
	+
	+	// P6: human-correction security-recall backstop. A human turn with a strong security
	+	// correction ("don't leak that", "rotate that key", "revert the auth change") whose
	+	// corrected (prior) node carried NO security label catches a real security event whose
	+	// action phrasing missed the keyword list. Strictly `inferred` and human-grounded -- it
	+	// never fabricates a strong/verified label.
	+	if (hasSecurityCorrection(node.text)) {
	+	const prior = nearestFailureTarget(node, tree.nodes);
	+	const anchor = prior ? prior.target : null;
	+	if (anchor && !securityNodeIds.has(anchor.id) && anchor.id !== node.id) {
	+	addFailure({
	+	type: 'security_or_privacy_risk',
	+	confidence: 0.62,
	+	tier: 'inferred',
	+	failureNode: anchor,
	+	correctionNode: node,
	+	resolvedNode: nearestAcceptedAfter(tree.nodes, anchor, node),
	+	evidence: `Human flagged a security concern about a prior action with no security label [signal: human security correction]: "${quote(node.text)}"`,
	+	summary: `A human security correction was raised near "${truncate(anchor.title, 90)}" with no matching action-level signal.`,
	+	});
	+	securityNodeIds.add(anchor.id);
	+	}
		}

		if (node.status === 'abandoned') {
		@@ -681,10 +776,18 @@ function inferSignals(node) {
		if (!matched.size && node.kind === 'correction') consider('misunderstood_goal', 0.62);

		if (!matched.size) return [];
	+	// P3: return all matching process kinds in priority order (capped) instead of
	+	// first-match-wins, so a node that is e.g. both scope_drift and ignored_constraint
	+	// surfaces both. misunderstood_goal is a fallback-only label and never co-emits.
	+	const out = [];
		for (const type of SIGNAL_PRIORITY) {
	-	if (matched.has(type)) return [{ type, confidence: matched.get(type) }];
	+	if (type === 'misunderstood_goal') continue;
	+	if (matched.has(type)) out.push({ type, confidence: matched.get(type) });
		}
	-	return [];
	+	if (!out.length && matched.has('misunderstood_goal')) {
	+	return [{ type: 'misunderstood_goal', confidence: matched.get('misunderstood_goal') }];
	+	}
	+	return out.slice(0, PROCESS_LABEL_CAP);
		}

		function tsOf(node) {
		@@ -692,11 +795,29 @@ function tsOf(node) {
		return Number.isFinite(t) ? t : null;
		}

	+	// Ingestion ordinal: node ids are assigned in stream order as `node_NNN` (src/tree.js),
	+	// so the numeric suffix is a stable parse-time ordinal. This is the causality tiebreak
	+	// used when timestamps are missing, instead of optimistically returning true (STRUCT-1).
	+	function ordinalOf(node) {
	+	if (!node) return null;
	+	if (Number.isFinite(node._ord)) return node._ord;
	+	const m = /(\d+)\s*$/.exec(String(node.id \|\| ''));
	+	return m ? Number(m[1]) : null;
	+	}
	+
	+	// P2: when timestamps are present, enforce ts ordering. When either timestamp is
	+	// missing, fall back to ingestion-ordinal ordering rather than returning true, so
	+	// timestamp-less adapters still get a real causal ordering and a corrector can never
	+	// be linked to a failure it preceded in the stream.
		function afterFailure(candidate, failureNode) {
		const ct = tsOf(candidate);
		const ft = tsOf(failureNode);
	-	if (ct === null \|\| ft === null) return true;
	-	return ct >= ft;
	+	if (ct !== null && ft !== null) return ct >= ft;
	+	const co = ordinalOf(candidate);
	+	const fo = ordinalOf(failureNode);
	+	if (co !== null && fo !== null) return co >= fo;
	+	// Timestamps incomplete and at least one ordinal missing: cannot establish ordering -> fail closed.
	+	return false;
		}

		function actionFiles(node) {
		@@ -712,8 +833,17 @@ function sharedFiles(a, b) {

		function tokenSet(node) {
		const out = new Set();
	-	for (const raw of String(node.text \|\| '').toLowerCase().match(/[a-z][a-z0-9_-]{2,}/g) \|\| []) {
	-	if (!STOPWORDS.has(raw)) out.add(raw);
	+	const harvest = (s) => {
	+	for (const raw of String(s \|\| '').toLowerCase().match(/[a-z][a-z0-9_-]{2,}/g) \|\| []) {
	+	if (!STOPWORDS.has(raw)) out.add(raw);
	+	}
	+	};
	+	harvest(node.text);
	+	// Include path tokens from this node's action files so a correction that names the
	+	// touched surface ("the auth flow") ties back to an edit of `src/auth/session.ts`.
	+	// This strengthens semantic linkage (STRUCT-3) without temporal guessing.
	+	for (const a of node.actions \|\| []) {
	+	if (a.file) harvest(String(a.file).replace(/[\\/.+_-]+/g, ' '));
		}
		return out;
		}
		@@ -727,8 +857,25 @@ function tokenOverlap(a, b) {
		return hits;
		}

	+	// Distinctive surface tokens: a single shared one between a security-file edit and a
	+	// correction is a strong semantic tie (e.g. an `auth/session.ts` edit + "fix the auth flow"),
	+	// where generic token overlap >= 3 would miss the link.
	+	const SURFACE_TOKENS = new Set([
	+	'auth', 'session', 'login', 'signin', 'signup', 'oauth', 'jwt', 'sso', 'saml',
	+	'secret', 'secrets', 'credential', 'credentials', 'password', 'token', 'apikey',
	+	'rbac', 'permission', 'permissions', 'middleware', 'crypto', 'encrypt', 'decrypt',
	+	]);
	+
	+	function sharedSurfaceToken(a, b) {
	+	const ta = tokenSet(a);
	+	const tb = tokenSet(b);
	+	for (const t of ta) if (SURFACE_TOKENS.has(t) && tb.has(t)) return true;
	+	return false;
	+	}
	+
		function sharesEvidence(failureNode, candidate) {
		if (sharedFiles(failureNode, candidate)) return true;
	+	if (sharedSurfaceToken(failureNode, candidate)) return true;
		return tokenOverlap(failureNode, candidate) >= 3;
		}

		@@ -737,7 +884,7 @@ function nearestFailureTarget(node, nodes) {
		(n) => n.status !== 'abandoned' && n.id !== node.id && afterFailure(node, n)
		);
		if (!earlier.length) return null;
	-	earlier.sort((a, b) => (tsOf(b) ?? 0) - (tsOf(a) ?? 0));
	+	earlier.sort((a, b) => orderAfter(b, a));
		const semantic = earlier.find((n) => sharesEvidence(n, node));
		if (semantic) return { target: semantic, linkage: 'semantic' };
		if (node.parent && node.parent.status !== 'abandoned' && node.parent.id !== node.id && afterFailure(node, node.parent)) {
		@@ -746,25 +893,63 @@ function nearestFailureTarget(node, nodes) {
		return { target: earlier[0], linkage: 'positional' };
		}

	+	// Acceptance/confirmation cue: an explicit "looks good / that works / fixed" turn is a
	+	// semantic resolution even when it shares no tokens or files with the failure.
	+	const ACCEPTANCE_RE =
	+	/\b(?:that(?:'?s\| is\| works\| fixed)\|works now\|looks? good\|lgtm\|perfect\|great\|nice\|fixed\|resolved\|that did it\|that worked\|much better\|exactly\|correct now)\b/i;
	+
	+	function laterCandidates(nodes, failureNode, anchor, extraExcludeId) {
	+	return nodes
	+	.filter((n) => n.status !== 'abandoned' && n.id !== failureNode.id && afterFailure(n, anchor))
	+	.filter((n) => !extraExcludeId \|\| n.id !== extraExcludeId)
	+	.sort(orderAfter);
	+	}
	+
	+	function orderAfter(a, b) {
	+	const ta = tsOf(a);
	+	const tb = tsOf(b);
	+	if (ta !== null && tb !== null) return ta - tb;
	+	return (ordinalOf(a) ?? Infinity) - (ordinalOf(b) ?? Infinity);
	+	}
	+
	+	// P2: only return a resolution when it actually ties back to the failure -- it shares
	+	// evidence (a shared file, a shared surface token, or token overlap >= 3) OR it is an
	+	// explicit acceptance/confirmation turn.
	+	// Otherwise return null. An honest null beats the temporally-nearest node, which is
	+	// frequently just "the next thing that happened" and poisons eval candidates.
		function nearestAcceptedAfter(nodes, failureNode, correctionNode) {
		const anchor = correctionNode \|\| failureNode;
	-	const later = nodes
	-	.filter((n) => n.status !== 'abandoned' && n.id !== failureNode.id && afterFailure(n, anchor))
	-	.filter((n) => !correctionNode \|\| n.id !== correctionNode.id);
	+	const later = laterCandidates(nodes, failureNode, anchor, correctionNode?.id);
		if (!later.length) return null;
	-	later.sort((a, b) => (tsOf(a) ?? Infinity) - (tsOf(b) ?? Infinity));
		const semantic = later.find((n) => sharesEvidence(failureNode, n));
	-	return semantic \|\| later[0];
	+	if (semantic) return semantic;
	+	const accepted = later.find((n) => ACCEPTANCE_RE.test(String(n.text \|\| '')));
	+	return accepted \|\| null;
		}

	+	// P2: only treat a later correction as the corrector when it semantically ties back to
	+	// the failure (shared evidence). A correction that merely happened later, about something
	+	// else, is not the corrector -- return null and let the signal stand uncorrected.
		function nearestCorrectionAfter(nodes, failureNode) {
	-	const later = nodes.filter(
	-	(n) => n.status !== 'abandoned' && n.kind === 'correction' && n.id !== failureNode.id && afterFailure(n, failureNode)
	-	);
	+	const later = nodes
	+	.filter((n) => n.status !== 'abandoned' && n.kind === 'correction' && n.id !== failureNode.id && afterFailure(n, failureNode))
	+	.sort(orderAfter);
		if (!later.length) return null;
	-	later.sort((a, b) => (tsOf(a) ?? Infinity) - (tsOf(b) ?? Infinity));
	-	const semantic = later.find((n) => sharesEvidence(failureNode, n));
	-	return semantic \|\| later[0];
	+	return later.find((n) => sharesEvidence(failureNode, n)) \|\| null;
	+	}
	+
	+	// Co-signal lookup for P1: a later non-abandoned node whose text carries security-correction
	+	// phrasing and that ties back to this node by shared evidence corroborates the signal.
	+	// Candidates are matched on phrasing alone; no author/role check is made here.
	+	function nearestSecurityCorrection(nodes, failureNode) {
	+	const later = nodes
	+	.filter(
	+	(n) =>
	+	n.status !== 'abandoned' &&
	+	n.id !== failureNode.id &&
	+	afterFailure(n, failureNode) &&
	+	hasSecurityCorrection(n.text)
	+	)
	+	.sort(orderAfter);
	+	return later.find((n) => sharesEvidence(failureNode, n)) \|\| null;
		}

		function tierRank(tier) {

src/redact.js +5 -0

		@@ -26,6 +26,11 @@ export const RULES = [
		{ id: 'url-basic-auth', severity: 'medium', re: /\b[a-z][a-z0-9+.-]{0,30}:\/\/[^/\s:@'"`]{2,256}:[^/\s@'"`]{2,256}@[^\s'"`]{1,512}/gi },
		{ id: 'bearer-header', severity: 'medium', re: /\bBearer\s+[A-Za-z0-9._+/=-]{20,}\b/g },
		{ id: 'secret-assignment', severity: 'medium', re: /["'`]?\b(password\|passwd\|pwd\|secret\|api[_-]?key\|access[_-]?token\|auth[_-]?token\|client[_-]?secret\|secret[_-]?key\|token\|bearer)\b["'`]?\s[:=]\s(?!(?:["'`]?\s)?(?:\$\{\|\$\(\|<\|%\|\{3}\|\.{3}\|REDACTED\|\[REDACTED\|xxx+\|placeholder\|changeme\|example\|your[-_]\|null\b\|true\b\|false\b))(?:"(?:[^"\\]\|\\.){4,512}"\|'(?:[^'\\]\|\\.){4,512}'\|`(?:[^`\\]\|\\.){4,512}`\|[^\s'"`,;){}]{6,512})/gi },
	+	// Fail-closed companion: a secret-key assignment whose quoted value contains ANY backslash escape
	+	// is redacted even when the escape-inflated character count falls under the generic floor above.
	+	// Escaped JSON string values (literal \n, \t, \", \\) are the common serialized form of a secret;
	+	// counting an escape as two characters must never let a short escaped value slip the gate.
	+	{ id: 'secret-assignment', severity: 'medium', re: /["'`]?\b(password\|passwd\|pwd\|secret\|api[_-]?key\|access[_-]?token\|auth[_-]?token\|client[_-]?secret\|secret[_-]?key\|token\|bearer)\b["'`]?\s[:=]\s(?!(?:["'`]?\s)?(?:\$\{\|\$\(\|<\|%\|\{3}\|\.{3}\|REDACTED\|\[REDACTED\|xxx+\|placeholder\|changeme\|example\|your[-_]\|null\b\|true\b\|false\b))(?:"(?:[^"\\]\|\\.)?\\.(?:[^"\\]\|\\.)?"\|'(?:[^'\\]\|\\.)?\\.(?:[^'\\]\|\\.)?'\|`(?:[^`\\]\|\\.)?\\.(?:[^`\\]\|\\.)?`)/gi },

		{ id: 'email', severity: 'soft', re: /\b[A-Za-z0-9._%+-]+@(?!(?:users\.noreply\.github\.com\|example\.(?:com\|org)))[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g },
		{ id: 'ipv4', severity: 'soft', re: /\b(?:(?:25[0-5]\|2[0-4]\d\|1\d\d\|[1-9]?\d)\.){3}(?:25[0-5]\|2[0-4]\d\|1\d\d\|[1-9]?\d)\b(?!\.\d)/g },

test/treetrace.test.js +263 -0

		@@ -1360,3 +1360,266 @@ test('cli: --stdin --from claude is rejected', () => {
		assert.throws(() => parseArgs(['--stdin', '--from', 'claude']), /cannot be combined with --from claude/);
		});

	+
	+	// ---------------------------------------------------------------------------
	+	// Labeling-accuracy fixes (proposal P1-P7) + negative-corpus release gate.
	+	// ---------------------------------------------------------------------------
	+
	+	test('P7: short escaped-JSON secret values fail closed (redaction gate)', () => {
	+	// Escape-inflated character counts must never let a short escaped value slip the floor.
	+	const cases = [
	+	['short escaped newline', '{"api_key":"a\\nz"}'],
	+	['tiny escaped value', '{"api_key":"x\\ny"}'],
	+	['escaped quote', '{"token":"a\\"b"}'],
	+	['escaped backslash', '{"secret":"a\\\\b"}'],
	+	['spec literal-\\n form', '{"api_key":"line1\\nline2line2line2"}'],
	+	];
	+	for (const [label, sample] of cases) {
	+	const hits = scanText(sample).map((f) => f.ruleId);
	+	assert.ok(hits.includes('secret-assignment'), `${label}: escaped secret must be caught (got ${JSON.stringify(hits)})`);
	+	}
	+	// Must not over-fire on benign short non-escaped values or placeholders.
	+	assert.equal(scanText('{"api_key":"ab"}').length, 0, 'benign short value below floor must stay clean');
	+	assert.equal(scanText('{"api_key":"${SECRET}"}').filter((f) => f.ruleId === 'secret-assignment').length, 0, 'placeholder must stay clean');
	+	});
	+
	+	test('P7: a short escaped-JSON secret leaves no raw value in any artifact end to end', async () => {
	+	const rawValue = 'a\\nz';
	+	const secretLine = `config is {"api_key":"${rawValue}"}`;
	+	const dir = mkdtempSync(join(tmpdir(), 'treetrace-p7-'));
	+	const file = join(dir, 'escconv.json');
	+	const convo = [{
	+	mapping: {
	+	r: { message: null, parent: null, children: ['u'] },
	+	u: { message: { author: { role: 'user' }, content: { parts: [secretLine] }, create_time: 1.0 }, parent: 'r', children: ['a'] },
	+	a: { message: { author: { role: 'assistant' }, content: { parts: ['ok'] }, create_time: 2.0 }, parent: 'u', children: [] },
	+	},
	+	}];
	+	writeFileSync(file, JSON.stringify(convo));
	+	try {
	+	await main(['--from', 'chatgpt', '--file', file, '--dir', dir, '--report', '--analysis', '--redact-auto', '--quiet']);
	+	const artifacts = [
	+	'PROMPT_TREE.md', 'TREETRACE_REPORT.md', '.treetrace/tree.json',
	+	'.treetrace/failures.json', '.treetrace/lessons.md', '.treetrace/evals.jsonl', '.treetrace/agent-memory.md',
	+	].filter((f) => existsSync(join(dir, f))).map((f) => readFileSync(join(dir, f), 'utf8')).join('\n');
	+	assert.ok(!artifacts.includes(rawValue), 'raw short escaped-JSON secret leaked into an artifact');
	+	assert.ok(artifacts.includes('[REDACTED:secret-assignment]'), 'expected a secret-assignment redaction marker');
	+	} finally {
	+	rmSync(dir, { recursive: true, force: true });
	+	}
	+	});
	+
	+	test('P1: a single strong security signal stays verified at exactly 0.95', () => {
	+	const node = {
	+	id: 'node_001', text: 'harden auth', title: 'harden auth', kind: 'root', status: 'accepted', parent: null,
	+	actions: [{ tool: 'Edit', file: 'src/auth/session.ts', command: null, model: 'm' }],
	+	};
	+	const sec = analyzeTree({ nodes: [node] }).failures.find((f) => f.type === 'security_or_privacy_risk');
	+	assert.ok(sec && sec.tier === 'verified' && sec.confidence === 0.95, 'strong anchor must remain verified/0.95');
	+	});
	+
	+	test('P1: confidence is derived from corroboration and the contributing signals are in the evidence', () => {
	+	// Many independent signals (credential content + credential file + risky cmd + surface) vs one weak keyword.
	+	const strong = {
	+	id: 'node_001', text: 'deploy', title: 'deploy', kind: 'root', status: 'accepted', parent: null,
	+	actions: [{ tool: 'Bash', file: 'src/auth/session.ts', command: '. /srv/app/.env; rm -rf /tmp/x; chmod 777 /etc', input: '. /srv/app/.env; rm -rf /tmp/x; chmod 777 /etc', model: 'm' }],
	+	};
	+	const strongSec = analyzeTree({ nodes: [strong] }).failures.find((f) => f.type === 'security_or_privacy_risk');
	+	assert.equal(strongSec.tier, 'verified');
	+	assert.ok(/signals:/.test(strongSec.evidence), 'evidence must list the contributing signals (auditable)');
	+	assert.ok(/strong credential content/.test(strongSec.evidence), 'evidence must name the strong credential signal');
	+
	+	const weak = {
	+	id: 'node_001', text: 'edit detector', title: 'x', kind: 'root', status: 'accepted', parent: null,
	+	actions: [{ tool: 'Edit', file: 'src/analyze.js', input: 'const ACCESS = /rbac/i;', command: null, model: 'm' }],
	+	};
	+	const weakSec = analyzeTree({ nodes: [weak] }).failures.find((f) => f.type === 'security_or_privacy_risk');
	+	// Derived: the lone-weak-keyword score must be strictly below the strong score.
	+	assert.ok(weakSec.confidence < strongSec.confidence, 'lone weak keyword must score below a multi-signal strong event');
	+	});
	+
	+	test('P2: afterFailure does not link a corrector that precedes its failure when timestamps are missing', () => {
	+	// Ingestion ordinal (node id suffix) is the tiebreak: node_001 precedes node_002 in the stream.
	+	const failure = {
	+	id: 'node_002', text: 'the deck still does not render here', title: 'still broken', kind: 'direction', status: 'accepted', parent: null,
	+	actions: [{ tool: 'Edit', file: 'site/deck/index.html', command: null, input: null, model: 'm' }],
	+	};
	+	const earlier = {
	+	id: 'node_001', text: 'no that is wrong redo the deck here please', title: 'redo', kind: 'correction', status: 'accepted', parent: failure,
	+	actions: [{ tool: 'Edit', file: 'site/deck/index.html', command: null, input: null, model: 'm' }],
	+	};
	+	const analysis = analyzeTree({ nodes: [failure, earlier] });
	+	for (const f of analysis.failures) {
	+	if (!f.correctedByNodeId) continue;
	+	const fo = Number(/(\d+)$/.exec(f.firstSeenNodeId)[1]);
	+	const co = Number(/(\d+)$/.exec(f.correctedByNodeId)[1]);
	+	assert.ok(co >= fo, `failure ${f.id} corrected by an earlier-ordinal node`);
	+	}
	+	});
	+
	+	test('P2: resolvedBy is null when no resolution ties back to the failure, instead of the temporally-nearest node', () => {
	+	const failure = {
	+	id: 'node_001', text: 'do not hardcode the database url into the config file please', title: 'no hardcoding', kind: 'correction', status: 'accepted', parent: null,
	+	ts: '2026-06-12T10:00:00.000Z', actions: [{ tool: 'Edit', file: 'config/db.ts', command: null, input: null, model: 'm' }],
	+	};
	+	const unrelatedLater = {
	+	id: 'node_002', text: 'now lets switch topics entirely and write the marketing landing copy', title: 'marketing', kind: 'direction', status: 'accepted', parent: failure,
	+	ts: '2026-06-12T11:00:00.000Z', actions: [{ tool: 'Edit', file: 'site/index.html', command: null, input: null, model: 'm' }],
	+	};
	+	const analysis = analyzeTree({ nodes: [failure, unrelatedLater] });
	+	for (const chain of analysis.correctionChains) {
	+	// The unrelated later node shares neither file nor surface token nor acceptance phrasing.
	+	assert.notEqual(chain.resolvedNodeId, 'node_002', 'must not resolve to an unrelated temporally-nearest node');
	+	}
	+	});
	+
	+	test('P2: an explicit acceptance turn IS accepted as a resolution even with no shared evidence', () => {
	+	// The failure/correction share a file (so they link), but the acceptance turn shares
	+	// NOTHING structural with the failure -- only its acceptance phrasing can recover it as
	+	// the resolution. This proves the acceptance path, not temporal-nearest guessing.
	+	const failure = {
	+	id: 'node_001', text: 'the checkout total is off by a cent on tax rounding', title: 'rounding bug', kind: 'direction', status: 'accepted', parent: null,
	+	ts: '2026-06-12T10:00:00.000Z', actions: [{ tool: 'Edit', file: 'src/checkout/total.ts', command: null, input: null, model: 'm' }],
	+	};
	+	const correction = {
	+	id: 'node_002', text: 'no the checkout total rounding is still wrong, redo the total calc', title: 'still wrong', kind: 'correction', status: 'accepted', parent: failure,
	+	ts: '2026-06-12T10:30:00.000Z', actions: [{ tool: 'Edit', file: 'src/checkout/total.ts', command: null, input: null, model: 'm' }],
	+	};
	+	const accepted = {
	+	id: 'node_003', text: 'perfect, that works now', title: 'works', kind: 'direction', status: 'accepted', parent: correction,
	+	ts: '2026-06-12T11:00:00.000Z', actions: [{ tool: 'Edit', file: 'src/unrelated/widget.ts', command: null, input: null, model: 'm' }],
	+	};
	+	const analysis = analyzeTree({ nodes: [failure, correction, accepted] });
	+	// failure + correction share total.ts, so a chain forms; the acceptance turn (node_003)
	+	// shares no file/surface with the failure, so only its acceptance phrasing can recover it
	+	// as the resolution -- proving the acceptance path, not temporal-nearest guessing.
	+	assert.ok(
	+	analysis.correctionChains.some((c) => c.resolvedNodeId === 'node_003'),
	+	'the explicit acceptance turn should be recorded as the resolution'
	+	);
	+	});
	+
	+	test('P3: a node that leaks a secret and runs a risky command surfaces both kinds', () => {
	+	const node = {
	+	id: 'node_001', text: 'deploy', title: 'deploy', kind: 'root', status: 'accepted', parent: null,
	+	actions: [{ tool: 'Bash', file: null, command: '. /srv/app/.env; rm -rf /var/data', input: '. /srv/app/.env; rm -rf /var/data', model: 'm' }],
	+	};
	+	const sec = analyzeTree({ nodes: [node] }).failures.find((f) => f.type === 'security_or_privacy_risk');
	+	assert.ok(/credential/.test(sec.evidence) && /risky-command/.test(sec.evidence), `both kinds must appear: ${sec.evidence}`);
	+	});
	+
	+	test('P3: inferSignals can return multiple process kinds for a multi-class correction', () => {
	+	const root = { id: 'node_001', text: 'build a dashboard', title: 'x', kind: 'root', status: 'accepted', parent: null, actions: [] };
	+	const corr = {
	+	id: 'node_002', kind: 'correction', status: 'accepted', parent: root, actions: [],
	+	text: 'no, you ignored what i asked for and this is overbuilt, scrap the web app, keep it minimal',
	+	title: 'multi-class correction',
	+	};
	+	const analysis = analyzeTree({ nodes: [root, corr] });
	+	const types = new Set(analysis.failures.map((f) => f.type));
	+	assert.ok(types.size >= 2, `expected multiple process labels, got ${[...types].join(', ')}`);
	+	});
	+
	+	test('P4: a bare rbac keyword with no co-signal stays inferred, never high/verified', () => {
	+	const node = {
	+	id: 'node_001', text: 'edit detector', title: 'x', kind: 'root', status: 'accepted', parent: null,
	+	actions: [{ tool: 'Edit', file: 'src/analyze.js', input: 'const ACCESS_CONTROL_WEAK_RE = /rbac\|access-control/i;', command: null, model: 'm' }],
	+	};
	+	const sec = analyzeTree({ nodes: [node] }).failures.find((f) => f.type === 'security_or_privacy_risk');
	+	assert.ok(sec && sec.tier === 'inferred', `lone weak keyword must be inferred (got ${sec && sec.tier})`);
	+	});
	+
	+	test('P4: a bare rbac keyword WITH a security-surface co-signal earns high tier', () => {
	+	const node = {
	+	id: 'node_001', text: 'wire up access control', title: 'x', kind: 'root', status: 'accepted', parent: null,
	+	actions: [{ tool: 'Edit', file: 'src/rbac/policy.ts', input: 'enable rbac for the route', command: null, model: 'm' }],
	+	};
	+	const sec = analyzeTree({ nodes: [node] }).failures.find((f) => f.type === 'security_or_privacy_risk');
	+	assert.ok(sec && (sec.tier === 'high' \|\| sec.tier === 'verified'), `keyword + surface co-signal should tier up (got ${sec && sec.tier})`);
	+	});
	+
	+	test('P6: a human security correction backstops a prior action that carried no security label', () => {
	+	const prior = {
	+	id: 'node_001', text: 'put the deploy config value directly into the deploy script', title: 'deploy config', kind: 'direction', status: 'accepted', parent: null,
	+	actions: [{ tool: 'Edit', file: 'deploy.sh', command: null, input: null, model: 'm' }],
	+	};
	+	const correction = {
	+	id: 'node_002', text: 'that is a secret, rotate that key and do not commit it to the deploy script', title: 'rotate', kind: 'correction', status: 'accepted', parent: prior,
	+	actions: [{ tool: 'Edit', file: 'deploy.sh', command: null, input: null, model: 'm' }],
	+	};
	+	const analysis = analyzeTree({ nodes: [prior, correction] });
	+	const sec = analysis.failures.find((f) => f.type === 'security_or_privacy_risk');
	+	assert.ok(sec, 'human security correction should backstop a missed security event');
	+	assert.equal(sec.tier, 'inferred', 'the backstop must be inferred only, never strong/verified');
	+	assert.ok(sec.confidence <= 0.7, 'the backstop confidence must stay low');
	+	});
	+
	+	test('P6: the backstop never fabricates a strong/verified security label from prose alone', () => {
	+	const root = { id: 'node_001', text: 'build the cli', title: 'x', kind: 'root', status: 'accepted', parent: null, actions: [] };
	+	const correction = {
	+	id: 'node_002', text: 'never leak the api secret token again', title: 'no leaks', kind: 'correction', status: 'accepted', parent: root, actions: [],
	+	};
	+	const analysis = analyzeTree({ nodes: [root, correction] });
	+	const strongSec = analysis.failures.filter((f) => f.type === 'security_or_privacy_risk' && (f.tier === 'verified' \|\| f.tier === 'high'));
	+	assert.equal(strongSec.length, 0, 'a human-correction backstop must never mint strong/verified labels');
	+	});
	+
	+	// RELEASE GATE: the negative corpus must produce ZERO security/failure/hallucination false positives.
	+	test('NEGATIVE CORPUS (release gate): benign inputs produce zero security/failure false positives', () => {
	+	const dir = tempProject();
	+	// Benign prompts that historically tripped keyword/substring/path false positives.
	+	const benign = [
	+	'capture a screenshot with chrome --headless --force-device-scale-factor=1 --screenshot=out.png',
	+	'edit src/ui/semantic-tokens.ts to adjust the design token palette',
	+	'update theme/design-tokens.json and src/lexer/tokenizer.ts for the new theme',
	+	'the access-control documentation mentions rbac as a concept; just explaining it in the readme',
	+	'we use JSON.parse and params.arguments and test.skip in the code, no changes needed',
	+	'add a token field to the response schema and document the bearer header format in the api guide',
	+	'rename the file from auth-helpers.md to authentication-notes.md in the docs folder',
	+	'the password strength meter component needs a tooltip, purely a UI label',
	+	];
	+	try {
	+	// The benign corpus references real files; create them so any hallucination flag is a
	+	// genuine false positive rather than a correct missing-file detection.
	+	mkdirSync(join(dir, 'src', 'ui'), { recursive: true });
	+	mkdirSync(join(dir, 'src', 'lexer'), { recursive: true });
	+	mkdirSync(join(dir, 'theme'), { recursive: true });
	+	mkdirSync(join(dir, 'docs'), { recursive: true });
	+	writeFileSync(join(dir, 'out.png'), 'x');
	+	writeFileSync(join(dir, 'src', 'ui', 'semantic-tokens.ts'), 'export const t = 1;\n');
	+	writeFileSync(join(dir, 'src', 'lexer', 'tokenizer.ts'), 'export const t = 1;\n');
	+	writeFileSync(join(dir, 'theme', 'design-tokens.json'), '{}');
	+	writeFileSync(join(dir, 'auth-helpers.md'), '# notes\n');
	+	writeFileSync(join(dir, 'authentication-notes.md'), '# notes\n');
	+	writeFileSync(join(dir, 'readme'), 'rbac is a concept\n');
	+
	+	const nodes = benign.map((text, i) => ({
	+	id: `node_${String(i + 1).padStart(3, '0')}`,
	+	text, title: text.slice(0, 40), kind: i === 0 ? 'root' : 'direction',
	+	status: 'accepted', parent: null,
	+	ts: `2026-06-12T${String(10 + i).padStart(2, '0')}:00:00.000Z`,
	+	// Benign UI/doc file edits, plus the chrome flag command.
	+	actions: i === 0
	+	? [{ tool: 'Bash', file: null, command: 'chrome --headless --force-device-scale-factor=1 --screenshot=out.png', model: 'm' }]
	+	: i === 1 ? [{ tool: 'Edit', file: 'src/ui/semantic-tokens.ts', model: 'm' }]
	+	: i === 2 ? [{ tool: 'Edit', file: 'theme/design-tokens.json', model: 'm' }]
	+	: [],
	+	}));
	+	for (let k = 1; k < nodes.length; k++) nodes[k].parent = nodes[k - 1];
	+
	+	const analysis = analyzeTree({ nodes: nodes.map((n) => ({ ...n })) });
	+	const secFps = analysis.failures.filter((f) => f.type === 'security_or_privacy_risk');
	+	assert.equal(secFps.length, 0, `negative corpus minted security false positives: ${JSON.stringify(secFps.map((f) => f.evidence))}`);
	+
	+	const halluc = detectHallucinations({ nodes: nodes.map((n) => ({ ...n })) }, dir).hallucinations;
	+	assert.equal(halluc.length, 0, `negative corpus minted hallucination false positives: ${JSON.stringify(halluc.map((h) => h.reference))}`);
	+
	+	// Redaction must not over-fire high/medium on benign prose.
	+	for (const text of benign) {
	+	const hi = scanText(text).filter((f) => f.severity === 'high' \|\| f.severity === 'medium');
	+	assert.equal(hi.length, 0, `redaction over-fired on benign text "${text}": ${JSON.stringify(hi.map((f) => f.ruleId))}`);
	+	}
	+	} finally {
	+	rmSync(dir, { recursive: true, force: true });
	+	}
	+	});