| @@ -2,6 +2,23 @@ | ||
| Notable changes to TreeTrace. The format follows Keep a Changelog, and the project uses semantic versioning. | ||
| + | ## 0.10.0 - 2026-06-19 | |
| + | ||
| + | ### Added | |
| + | ||
| + | - `--each` writes one full, redacted report bundle per session into `--out-dir` (default `treetrace-reports/`), plus `INDEX.md` and `index.json` manifests summarizing prompts, corrections, rejections, and security flags per session. Batch runs auto-redact and fail closed. | |
| + | - `--deterministic` pins the generation timestamp so re-running on the same session produces byte-identical artifacts, for reproducible audit bundles and stable diffs. | |
| + | - Model-refusal capture now works on plain `User:` / `Assistant:` transcripts (inline and multi-line assistant turns) and on the ChatGPT, Codex, Cursor, and Gemini adapters, not only native Claude Code sessions. User text declines are also captured on plain transcripts. | |
| + | ||
| + | ### Changed | |
| + | ||
| + | - Refusal overrides are no longer promoted into correction chains or eval candidates: a human turn the model refused, or the push-back immediately after a refusal, is excluded from misunderstood-goal and security-intent promotion, and refusal or decline failure types no longer quote the refused content as a requirement to honor. The refusal itself is still recorded. | |
| + | - `loadRedactedTree` is split into `collectSessions` and `treeFromSessions` so a single run can build one tree per session; existing single-run behavior is unchanged. | |
| + | ||
| + | ### Documentation | |
| + | ||
| + | - README documents `--each` and `--deterministic`, and the signal-coverage matrix reflects the broadened refusal capture. | |
| + | ||
| ## 0.9.2 - 2026-06-19 | ||
| ### Documentation |
| @@ -108,7 +108,7 @@ TreeTrace reads coding and CLI agent sessions (Claude Code, Codex, Cursor, Copil | ||
| ### Signal coverage by adapter | ||
| - | Signal coverage depends on what each tool exports. The matrix below reflects the actual source code (v0.9.1); cells marked `--` are confirmed absent. | |
| + | Signal coverage depends on what each tool exports. The matrix below reflects the actual source code (v0.10.0); cells marked `--` are confirmed absent. A plain `User:` / `Assistant:` transcript imported with `--from transcript` also captures prompt lineage, corrections, model refusals, and user declines. | |
| | Signal | Claude Code | ChatGPT | Codex | Cursor | Copilot | Gemini | Grok | | ||
| |---|:---:|:---:|:---:|:---:|:---:|:---:|:---:| | ||
| @@ -121,12 +121,14 @@ Signal coverage depends on what each tool exports. The matrix below reflects the | ||
| | Tool uses | full | partial | full | full | full | full | partial | | ||
| | Files touched | full | -- | full | full | full | full | -- | | ||
| | Bash commands | full | -- | partial | partial | partial | partial | -- | | ||
| - | | Refusals / denials | full | -- | -- | -- | -- | -- | -- | | |
| + | | Refusals / denials | full | partial | partial | partial | -- | partial | -- | | |
| | Thinking / reasoning blocks | partial | -- | full | -- | -- | full | -- | | ||
| | Timestamps (first/last) | full | partial | partial | partial | partial | partial | partial | | ||
| | Per-turn latency | -- | -- | -- | -- | -- | -- | -- | | ||
| | Corrections / scope-changes | full | full | full | full | full | full | full | | ||
| - | | Rejections by kind | full | -- | -- | -- | -- | -- | -- | | |
| + | | Rejections by kind | full | partial | partial | partial | -- | partial | -- | | |
| + | ||
| + | Refusal capture: `full` on Claude Code (model refusal by text and stop-reason, user declines, tool-permission denials); `partial` on ChatGPT, Codex, Cursor, and Gemini (assistant-text model refusals). Copilot and Grok exports do not currently surface refusal signals. | |
| **Cell key:** `full` - extracted and stored in schema field. `partial` - extracted where the source format exposes it. `--` - not captured; confirmed absent in source code. | ||
| @@ -182,6 +184,8 @@ Claude Code (native JSONL) is the richest source: it covers all rejection kinds, | ||
| | `npx treetrace --memory` | Write and print `.treetrace/agent-memory.md` | | ||
| | `npx treetrace --graph` | Write `PROMPT_TREE_GRAPH.md`, a branded Mermaid graph that renders free on GitHub with no dependencies; large projects auto-summarize, and `--full` or `--summary` force a mode | | ||
| | `npx treetrace --security` | Print a security-focused report and write `.treetrace/hallucinations.json` | | ||
| + | | `npx treetrace --each` | Write one full report bundle per session into `--out-dir` (default `treetrace-reports/`), plus `INDEX.md` and `index.json` manifests; auto-redacts each bundle and fails closed | | |
| + | | `npx treetrace --deterministic` | Pin the generation timestamp so re-running on the same session produces byte-identical artifacts | | |
| | `npx treetrace mcp` | Start a read-only MCP server over stdio | | ||
| | `npx treetrace --titles-only` | Compact human tree, no full prompt details | | ||
| | `npx treetrace --redact-auto` | Redact every detected secret without prompting | |
| @@ -1,6 +1,6 @@ | ||
| { | ||
| "name": "treetrace", | ||
| - | "version": "0.9.2", | |
| + | "version": "0.10.0", | |
| "description": "Prompt-lineage visibility for coding & CLI agent sessions. Turns a raw session into a structured, local record of every correction, refusal, token, and tool - for audit, training data, and token efficiency.", | ||
| "keywords": [ | ||
| "claude-code", |
| @@ -1,4 +1,4 @@ | ||
| - | import { newSession, finalizeSession, pushTurn, flattenParts, looksSynthetic } from './shared.js'; | |
| + | import { newSession, finalizeSession, pushTurn, flattenParts, looksSynthetic, noteAssistantRefusal } from './shared.js'; | |
| function conversationList(parsed) { | ||
| if (Array.isArray(parsed)) return parsed; | ||
| @@ -46,6 +46,7 @@ function sessionFromConversation(convo, path, index) { | ||
| } else if (role === 'assistant') { | ||
| session.stats.assistantLines++; | ||
| if (msg.metadata && msg.metadata.model_slug) session.stats.models.add(msg.metadata.model_slug); | ||
| + | noteAssistantRefusal(session, text); | |
| } else if (role === 'tool') { | ||
| session.stats.toolUses++; | ||
| } |
| @@ -6,6 +6,7 @@ import { | ||
| addThinking, | ||
| flattenParts, | ||
| looksSynthetic, | ||
| + | noteAssistantRefusal, | |
| readJsonl, | ||
| } from './shared.js'; | ||
| @@ -50,6 +51,7 @@ export function parseCodex(text, path, sessionId) { | ||
| pushTurn(session, ++turn, body, ts); | ||
| } else if (payload.role === 'assistant') { | ||
| session.stats.assistantLines++; | ||
| + | noteAssistantRefusal(session, flattenParts(payload.content)); | |
| } | ||
| continue; | ||
| } |
| @@ -1,4 +1,4 @@ | ||
| - | import { newSession, finalizeSession, pushTurn, addAction, looksSynthetic } from './shared.js'; | |
| + | import { newSession, finalizeSession, pushTurn, addAction, looksSynthetic, noteAssistantRefusal } from './shared.js'; | |
| function parseCursorParams(tfd) { | ||
| const raw = tfd && (tfd.params || tfd.rawArgs); | ||
| @@ -77,6 +77,7 @@ function parseExportedSession(parsed, path, sessionId) { | ||
| } else if (msg.role === 'assistant') { | ||
| session.stats.assistantLines++; | ||
| if (msg.model) session.stats.models.add(msg.model); | ||
| + | if (typeof msg.content === 'string') noteAssistantRefusal(session, msg.content); | |
| if (Array.isArray(msg.toolCalls)) { | ||
| for (const call of msg.toolCalls) { | ||
| session.stats.toolUses++; | ||
| @@ -150,6 +151,7 @@ export function parseCursor(parsed, path, sessionId) { | ||
| pushTurn(session, ++turn, text, ts); | ||
| } else { | ||
| session.stats.assistantLines++; | ||
| + | noteAssistantRefusal(session, bubbleText(bubble)); | |
| if (bubble.toolFormerData) { | ||
| session.stats.toolUses++; | ||
| const tfd = bubble.toolFormerData; |
| @@ -6,6 +6,7 @@ import { | ||
| addThinking, | ||
| flattenParts, | ||
| looksSynthetic, | ||
| + | noteAssistantRefusal, | |
| readJsonl, | ||
| } from './shared.js'; | ||
| @@ -32,6 +33,7 @@ function ingestRecord(session, rec, counters) { | ||
| } else if (type === 'gemini' || type === 'model' || type === 'assistant') { | ||
| session.stats.assistantLines++; | ||
| if (rec.model) session.stats.models.add(rec.model); | ||
| + | noteAssistantRefusal(session, partsToText(rec.content)); | |
| if (Array.isArray(rec.toolCalls)) { | ||
| for (const call of rec.toolCalls) { | ||
| session.stats.toolUses++; |
| @@ -1,3 +1,6 @@ | ||
| + | import { truncate } from '../util.js'; | |
| + | import { looksLikeRefusal } from '../parse.js'; | |
| + | ||
| export function emptyStats() { | ||
| return { | ||
| userLines: 0, | ||
| @@ -95,6 +98,24 @@ export function addRejection(session, rejection) { | ||
| session.stats.rejectionsByKind[rejection.kind] = (session.stats.rejectionsByKind[rejection.kind] || 0) + 1; | ||
| } | ||
/**
 * Record a `model_refusal` rejection when an assistant turn reads as a refusal.
 *
 * Intended for structured-export adapters: they hand over the flattened
 * assistant text and this helper applies the same text heuristic used on the
 * native path (source `text_heuristic`, confidence 0.7).
 *
 * Nothing is recorded when there is no session, when the session has no
 * current prompt to blame the refusal on, or when `looksLikeRefusal` rejects
 * the text. The rejection itself is stored through `addRejection`
 * (presumably against `session._currentPrompt` - confirm in addRejection).
 *
 * @param {object} session - adapter session under construction
 * @param {*} text - assistant turn text; non-string values never match
 */
export function noteAssistantRefusal(session, text) {
  const hasCurrentPrompt = Boolean(session && session._currentPrompt);
  if (!hasCurrentPrompt || !looksLikeRefusal(text)) return;

  // Evidence is a short excerpt only; the full refused text is not copied.
  const evidence = truncate(typeof text === 'string' ? text : '', 160);
  addRejection(session, {
    kind: 'model_refusal',
    source: 'text_heuristic',
    confidence: 0.7,
    toolUseId: null,
    tool: null,
    ts: null,
    evidence,
  });
}
| + | ||
| export function flattenParts(parts) { | ||
| if (typeof parts === 'string') return parts; | ||
| if (!Array.isArray(parts)) { |
| @@ -388,9 +388,15 @@ export function analyzeTree(tree) { | ||
| type: evalType, | ||
| task: evalTaskFor(type), | ||
| context: summary, | ||
| - | input: correctionNode | |
| - | ? `Honor this correction and keep building: "${quote(correctionNode.text)}"` | |
| - | : `Honor this stated requirement and keep building: "${quote(failureNode.text)}"`, | |
| + | // Refusal/decline failures are not "requirements to honor": quoting the | |
| + | // refused or declined text as an instruction would bake the (possibly | |
| + | // harmful) request into a regression case telling agents to comply. For | |
| + | // those types use the neutral task framing instead of quoting content. | |
| + | input: REFUSAL_INPUT_TYPES.has(type) | |
| + | ? evalTaskFor(type) | |
| + | : correctionNode | |
| + | ? `Honor this correction and keep building: "${quote(correctionNode.text)}"` | |
| + | : `Honor this stated requirement and keep building: "${quote(failureNode.text)}"`, | |
| expected_behavior: expectedBehaviorFor(type), | ||
| failure_mode: failureModeFor(type), | ||
| sourceNodeIds: ids, | ||
| @@ -431,6 +437,19 @@ export function analyzeTree(tree) { | ||
| return failure; | ||
| }; | ||
| + | // Refusal-adjacency. A turn that the model refused, or the human | |
| + | // push-back immediately after a refusal, must not be promoted into a | |
| + | // "honor this requirement/correction" eval, lesson, or correction chain: | |
| + | // that would bake the refused (often harmful) request into a regression case | |
| + | // telling future agents to comply. The refusal itself is still recorded by | |
| + | // the rejection-surfacing pass, and real agent-action security findings are | |
| + | // unaffected; only the intent/correction promotions are gated. | |
| + | const nodeHasModelRefusal = (n) => | |
| + | Array.isArray(n && n.rejections) && n.rejections.some((r) => r.kind === 'model_refusal'); | |
| + | // In-memory nodes link to their predecessor via `.parent` (an object ref); | |
| + | // `parentId` is only attached at render time, so walk `.parent` here. | |
| + | const refusalAdjacent = (node) => nodeHasModelRefusal(node) || nodeHasModelRefusal(node && node.parent); | |
| + | ||
| const securityNodeIds = new Set(); | ||
| tree.nodes.forEach((node, index) => { | ||
| // v0.3: rejection surfacing pass. Each captured rejection becomes a failure | ||
| @@ -484,7 +503,7 @@ export function analyzeTree(tree) { | ||
| summary: `An agent action touched auth, secrets, or access control near "${truncate(node.title, 90)}".`, | ||
| }); | ||
| securityNodeIds.add(node.id); | ||
| - | } else if (node.text.length <= 1200 && SECURITY_INTENT_RE.test(node.text)) { | |
| + | } else if (node.text.length <= 1200 && SECURITY_INTENT_RE.test(node.text) && !refusalAdjacent(node)) { | |
| addFailure({ | ||
| type: 'security_or_privacy_risk', | ||
| confidence: 0.7, | ||
| @@ -553,6 +572,10 @@ export function analyzeTree(tree) { | ||
| FRUSTRATION_HINT.test(node.text) || | ||
| PRIVACY_HINT.test(node.text); | ||
| if (!shouldAnalyze) return; | ||
| + | // Skip the misunderstood_goal / correction promotion for refusal | |
| + | // overrides. The refusal stays recorded; we just do not manufacture a | |
| + | // correction chain, eval, or lesson that honors the overridden request. | |
| + | if (refusalAdjacent(node)) return; | |
| const signals = inferSignals(node); | ||
| if (!signals.length) return; | ||
| @@ -1214,6 +1237,11 @@ function lessonFor(type, { evidence = '', summary = '' } = {}) { | ||
| }; | ||
| } | ||
| + | // Failure types that represent a refusal or a declined action rather than a | |
| + | // requirement the agent should honor. Their eval input uses the neutral task | |
| + | // framing (see addFailure) so refused content is never quoted as an instruction. | |
| + | const REFUSAL_INPUT_TYPES = new Set(['model_refused', 'user_rejected_action', 'permission_denied', 'tool_execution_failed']); | |
| + | ||
| function evalTypeFor(type) { | ||
| if (type === 'security_or_privacy_risk') return 'privacy_boundary_preservation'; | ||
| if (type === 'scope_drift' || type === 'overbuilt_solution') return 'scope_drift_detection'; |
| @@ -27,6 +27,11 @@ import { c, plural, truncate, TreetraceError, ExitCode } from './util.js'; | ||
| const VERSION = JSON.parse(readFileSync(new URL('../package.json', import.meta.url), 'utf8')).version; | ||
| + | // --deterministic pins the only run-to-run volatile field (the generation | |
| + | // timestamp) so re-running on the same session yields byte-identical artifacts, | |
| + | // for reproducible audit bundles and the "run twice, diff is empty" demo. | |
| + | const DETERMINISTIC_TIMESTAMP = '1970-01-01T00:00:00.000Z'; | |
| + | ||
| const HELP = `TreeTrace - turn AI coding sessions into regression-ready prompt lineage | ||
| Usage: | ||
| @@ -44,6 +49,7 @@ Usage: | ||
| treetrace --graph write a branded Mermaid prompt-tree graph (PROMPT_TREE_GRAPH.md) | ||
| large projects auto-summarize; --full / --summary force a mode | ||
| treetrace --security print a security-focused report for this session | ||
| + | treetrace --each write one report bundle per session (+ INDEX manifest) | |
| treetrace mcp start a read-only MCP server over stdio | ||
| Options: | ||
| @@ -64,6 +70,13 @@ Options: | ||
| for any value that also matches a named secret rule | ||
| --since <YYYY-MM-DD> only include sessions active on/after this date | ||
| (timestamped sessions only; plain transcripts are excluded) | ||
| + | --each write one full report bundle per session into --out-dir, | |
| + | plus INDEX.md and index.json manifests (batch / GRC mode; | |
| + | auto-redacts each bundle, fails closed) | |
| + | --out-dir <path> output root for --each (default: treetrace-reports/) | |
| + | --deterministic pin the generation timestamp so re-running on the same | |
| + | session produces byte-identical artifacts (reproducible | |
| + | audit bundles; clean run-twice diffs) | |
| --quiet suppress progress output | ||
| --version, --help | ||
| @@ -84,12 +97,14 @@ export async function main(argv) { | ||
| const projectName = detectProjectName(projectDir); | ||
| const log = opts.quiet ? () => {} : (msg) => process.stderr.write(`${msg}\n`); | ||
| + | if (opts.each) return await runEach(opts, projectDir, projectName, log); | |
| + | ||
| const { tree, decisions, asked, sourceTool } = await loadRedactedTree(opts, projectDir, projectName, log); | ||
| const ttDir = join(projectDir, '.treetrace'); | ||
| const decisionsPath = join(ttDir, 'redactions.json'); | ||
| - | const generatedAt = new Date().toISOString(); | |
| + | const generatedAt = opts.deterministic ? DETERMINISTIC_TIMESTAMP : new Date().toISOString(); | |
| const renderOpts = { projectName, titlesOnly: opts.titlesOnly, version: VERSION, generatedAt, sourceType: sourceTypeFor(sourceTool) }; | ||
| if (opts.handoff) { | ||
| @@ -206,7 +221,7 @@ export async function main(argv) { | ||
| if (asked) log(c.dim(` ${plural(asked, 'redaction decision')} saved to .treetrace/redactions.json`)); | ||
| } | ||
| - | export async function loadRedactedTree(opts, projectDir, projectName, log = () => {}, { forceAuto = false } = {}) { | |
| + | export async function collectSessions(opts, projectDir, projectName, log = () => {}) { | |
| let sessions = []; | ||
| let sourceTool = 'claude'; | ||
| if (opts.stdin) { | ||
| @@ -219,10 +234,12 @@ export async function loadRedactedTree(opts, projectDir, projectName, log = () = | ||
| sessions = [parsePlainTranscript(text)]; | ||
| sourceTool = 'transcript'; | ||
| } | ||
| + | for (const s of sessions) s.sourceTool = sourceTool; | |
| } else if (opts.files.length) { | ||
| const tools = new Set(); | ||
| for (const file of opts.files) { | ||
| const { sessions: fileSessions, tool } = await ingestFile(file, opts.from, log); | ||
| + | for (const s of fileSessions) s.sourceTool = tool; | |
| sessions.push(...fileSessions); | ||
| tools.add(tool); | ||
| } | ||
| @@ -247,7 +264,9 @@ export async function loadRedactedTree(opts, projectDir, projectName, log = () = | ||
| for (const meta of filtered) { | ||
| if (meta.sizeBytes > 5 * 1048576) | ||
| log(c.dim(` parsing ${meta.sessionId.slice(0, 8)}... (${(meta.sizeBytes / 1048576).toFixed(0)} MB)`)); | ||
| - | sessions.push(await parseSessionFile(meta.path, meta)); | |
| + | const parsed = await parseSessionFile(meta.path, meta); | |
| + | parsed.sourceTool = 'claude'; | |
| + | sessions.push(parsed); | |
| } | ||
| } | ||
| @@ -262,6 +281,10 @@ export async function loadRedactedTree(opts, projectDir, projectName, log = () = | ||
| } | ||
| } | ||
| + | return { sessions, sourceTool }; | |
| + | } | |
| + | ||
| + | export async function treeFromSessions(sessions, opts, projectDir, log = () => {}, { forceAuto = false } = {}) { | |
| const nodes = classifyPrompts(sessions); | ||
| if (!nodes.length) { | ||
| throw new TreetraceError('no human prompts found in these sessions, nothing to trace.', ExitCode.NO_DATA); | ||
| @@ -345,6 +368,12 @@ export async function loadRedactedTree(opts, projectDir, projectName, log = () = | ||
| } | ||
| analyzeTree(tree); | ||
| + | return { tree, decisions, asked }; | |
| + | } | |
| + | ||
/**
 * Single-run entry point: collect every session for this invocation, then
 * build one redacted, analyzed tree from all of them.
 *
 * Kept as a thin composition of `collectSessions` and `treeFromSessions` so
 * existing callers see the same `{ tree, decisions, asked, sourceTool }` shape.
 *
 * @param {object} opts - parsed CLI options
 * @param {string} projectDir - project root
 * @param {string} projectName - display name of the project
 * @param {(msg: string) => void} [log] - progress logger (no-op by default)
 * @param {{ forceAuto?: boolean }} [options] - forwarded to `treeFromSessions`
 * @returns {Promise<{tree: object, decisions: *, asked: *, sourceTool: string}>}
 */
export async function loadRedactedTree(opts, projectDir, projectName, log = () => {}, { forceAuto = false } = {}) {
  const collected = await collectSessions(opts, projectDir, projectName, log);
  const built = await treeFromSessions(collected.sessions, opts, projectDir, log, { forceAuto });
  return {
    tree: built.tree,
    decisions: built.decisions,
    asked: built.asked,
    sourceTool: collected.sourceTool,
  };
}
| @@ -449,6 +478,147 @@ function requestedArtifacts(opts, artifacts) { | ||
| return requested; | ||
| } | ||
/**
 * `--each`: write one full report bundle per session into `--out-dir`, plus an
 * INDEX manifest covering every bundle that was written.
 *
 * Each session is turned into its own tree so every bundle is a standalone
 * record. Batch runs are unattended, so each tree is built with
 * `forceAuto: true` (auto-redaction, no prompting).
 *
 * Sessions for which `treeFromSessions` throws a `TreetraceError` with code
 * `ExitCode.NO_DATA` are logged and skipped; any other error aborts the run.
 *
 * @param {object} opts - parsed CLI options (`outDir`, `deterministic`, `titlesOnly`, ...)
 * @param {string} projectDir - project root; `--out-dir` is resolved against it
 * @param {string} projectName - display name used in rendered artifacts
 * @param {(msg: string) => void} log - progress logger
 * @returns {Promise<void>}
 * @throws {TreetraceError} with `ExitCode.NO_DATA` when no session produced a report
 */
async function runEach(opts, projectDir, projectName, log) {
  const { sessions, sourceTool } = await collectSessions(opts, projectDir, projectName, log);
  const outRoot = resolve(projectDir, opts.outDir || 'treetrace-reports');
  // One timestamp for the whole batch so every bundle and the manifest agree.
  const generatedAt = opts.deterministic ? DETERMINISTIC_TIMESTAMP : new Date().toISOString();
  const manifest = [];
  const usedLabels = new Set();
  let idx = 0;
  for (const session of sessions) {
    idx++;
    let built;
    try {
      built = await treeFromSessions([session], opts, projectDir, log, { forceAuto: true });
    } catch (err) {
      // A session without human prompts is not a batch failure; skip it.
      if (err instanceof TreetraceError && err.code === ExitCode.NO_DATA) {
        log(c.dim(` skip ${session.sessionId || `session-${idx}`}: nothing to trace`));
        continue;
      }
      throw err;
    }
    const { tree, decisions } = built;
    // Prefer the per-session tool tag; fall back to the run-level one.
    const sessionTool = session.sourceTool || sourceTool;
    const label = uniqueLabel(session.sessionId, idx, usedLabels);
    const targetDir = join(outRoot, label);
    const renderOpts = {
      projectName,
      titlesOnly: opts.titlesOnly,
      version: VERSION,
      generatedAt,
      sourceType: sourceTypeFor(sessionTool),
    };
    writeBundle(targetDir, tree, decisions, renderOpts, projectDir);
    manifest.push(summarizeSession(label, session, tree, sessionTool, targetDir, projectDir));
    // \u2713 = check mark, \u00b7 = middle dot. Written as escapes so the
    // glyphs survive a mis-encoded copy of this file.
    log(c.green(`\u2713 ${label} \u00b7 ${plural(tree.stats.promptCount, 'prompt')} -> ${relativeish(targetDir, projectDir)}`));
  }
  if (!manifest.length) {
    throw new TreetraceError('no sessions produced a report (nothing to trace).', ExitCode.NO_DATA);
  }
  writeManifest(outRoot, manifest, projectName, generatedAt);
  log('');
  log(`${c.green('ok')} wrote ${plural(manifest.length, 'session report')} to ${c.bold(relativeish(outRoot, projectDir))} (see INDEX.md)`);
}
| + | ||
/**
 * Derive a filesystem-safe directory label for one session bundle, unique
 * within the current run.
 *
 * The session id is reduced to `[A-Za-z0-9._-]` (everything else becomes `-`),
 * stripped of leading/trailing dashes and capped at 64 characters. An id that
 * is missing, sanitizes to nothing, or consists only of dots falls back to
 * `session-<idx>`. The dots-only case matters: `.` and `..` pass the character
 * whitelist, and joining them onto the output root would target the root
 * itself or its parent instead of a per-session subdirectory.
 *
 * @param {*} sessionId - raw session identifier (may be null/empty)
 * @param {number} idx - 1-based position of the session in the batch
 * @param {Set<string>} used - labels already handed out; the result is added to it
 * @returns {string} a label not previously present in `used`
 */
function uniqueLabel(sessionId, idx, used) {
  const fallback = `session-${idx}`;
  let base = String(sessionId || fallback)
    .replace(/[^A-Za-z0-9._-]/g, '-')
    .replace(/^-+|-+$/g, '');
  // Reject empty and dots-only names so the label is always a real child directory.
  if (!base || /^\.+$/.test(base)) base = fallback;
  if (base.length > 64) base = base.slice(0, 64);

  // Disambiguate repeats (or truncation collisions) with -2, -3, ...
  let label = base;
  let n = 2;
  while (used.has(label)) label = `${base}-${n++}`;
  used.add(label);
  return label;
}
| + | ||
/**
 * Render every artifact for one session tree and write the bundle to disk.
 *
 * All rendering and all `assertClean(..., true)` passes run before the first
 * directory or file is created, so nothing is written if any of them throws.
 *
 * Layout under `targetDir`:
 *   PROMPT_TREE.md, TREETRACE_REPORT.md,
 *   .treetrace/tree.json, .treetrace/redactions.json,
 *   plus each analysis artifact at the path `analysisArtifacts` assigned it.
 *
 * @param {string} targetDir - bundle directory for this session
 * @param {object} tree - analyzed session tree
 * @param {*} decisions - redaction decisions; also serialized to redactions.json
 * @param {object} renderOpts - options shared by all renderers
 * @param {string} projectDir - project root, forwarded to `analysisArtifacts`
 */
function writeBundle(targetDir, tree, decisions, renderOpts, projectDir) {
  const ttDir = join(targetDir, '.treetrace');

  // Phase 1: render (same order as the artifacts are cleaned below).
  const rawTree = renderMarkdown(tree, renderOpts);
  const rawJson = JSON.stringify(renderJson(tree, renderOpts), null, 2);
  const artifacts = analysisArtifacts(ttDir, tree, renderOpts, projectDir);
  const rawReport = renderReportMarkdown(tree, renderOpts);

  // Phase 2: auto-redact every rendered text.
  const scrub = (text, label) => assertClean(text, decisions, label, true);
  const md = scrub(rawTree, 'PROMPT_TREE.md');
  const jsonText = scrub(rawJson, 'tree.json');
  const artifactList = Object.values(artifacts);
  artifactList.forEach((artifact) => {
    artifact.text = scrub(artifact.text, artifact.label);
  });
  const report = scrub(rawReport, 'TREETRACE_REPORT.md');

  // Phase 3: write.
  mkdirSync(targetDir, { recursive: true });
  mkdirSync(ttDir, { recursive: true });
  writeFileSync(join(targetDir, 'PROMPT_TREE.md'), md);
  writeFileSync(join(targetDir, 'TREETRACE_REPORT.md'), report);
  writeFileSync(join(ttDir, 'tree.json'), jsonText);
  artifactList.forEach((artifact) => {
    writeFileSync(artifact.path, artifact.text);
  });
  writeFileSync(join(ttDir, 'redactions.json'), JSON.stringify(decisions, null, 2));
}
| + | ||
/**
 * Build the manifest entry (one row of INDEX.md / index.json) for a session.
 *
 * Counts come from `tree.stats`; failure totals come from the analysis
 * summary already attached to the tree, or from a fresh `analyzeTree(tree)`
 * call when none is attached.
 *
 * @param {string} label - bundle directory label
 * @param {object} session - source session (only `sessionId` is read)
 * @param {object} tree - analyzed tree for this session
 * @param {string} sourceTool - tool the session came from
 * @param {string} targetDir - bundle directory
 * @param {string} projectDir - project root, used to shorten `dir`
 * @returns {object} plain manifest record
 */
function summarizeSession(label, session, tree, sourceTool, targetDir, projectDir) {
  const stats = tree.stats;
  const attached = tree.analysis && tree.analysis.summary;
  const summary = attached || analyzeTree(tree).summary;

  // Security flags = count of the security/privacy failure type, if present.
  const failureTypes = summary.topFailureTypes || [];
  const security = failureTypes.find((entry) => entry.type === 'security_or_privacy_risk');
  const securityFlags = security ? security.count : 0;

  return {
    label,
    sessionId: session.sessionId || null,
    source: sourceTool,
    prompts: stats.promptCount,
    corrections: stats.corrections || 0,
    abandonedBranches: stats.abandonedBranches || 0,
    rejections: stats.rejections || 0,
    securityFlags,
    failureSignals: summary.totalFailureSignals || 0,
    correctionChains: summary.correctionChains || 0,
    models: stats.models || [],
    firstTs: stats.firstTs || null,
    lastTs: stats.lastTs || null,
    dir: relativeish(targetDir, projectDir),
  };
}
| + | ||
/**
 * Write the batch manifests (`index.json` and `INDEX.md`) into `outRoot`.
 *
 * Totals are summed over the per-session entries; a missing or falsy field
 * counts as 0.
 *
 * @param {string} outRoot - output root directory (created if absent)
 * @param {object[]} manifest - per-session entries from `summarizeSession`
 * @param {string} projectName - project display name
 * @param {string} generatedAt - ISO timestamp recorded in the manifest
 */
function writeManifest(outRoot, manifest, projectName, generatedAt) {
  const sumOf = (field) => manifest.reduce((acc, entry) => acc + (entry[field] || 0), 0);

  const indexJson = {
    schemaVersion: 1,
    project: projectName,
    generatedAt,
    sessionCount: manifest.length,
    totals: {
      prompts: sumOf('prompts'),
      corrections: sumOf('corrections'),
      rejections: sumOf('rejections'),
      securityFlags: sumOf('securityFlags'),
      failureSignals: sumOf('failureSignals'),
    },
    sessions: manifest,
  };

  mkdirSync(outRoot, { recursive: true });
  const jsonPath = join(outRoot, 'index.json');
  writeFileSync(jsonPath, JSON.stringify(indexJson, null, 2));
  const markdownPath = join(outRoot, 'INDEX.md');
  writeFileSync(markdownPath, renderManifestMarkdown(indexJson));
}
| + | ||
/**
 * Render the batch manifest as Markdown: a title, a one-line totals headline,
 * and a table with one row per session linking to its report.
 *
 * @param {object} index - manifest object as written to index.json
 *   (`project`, `sessionCount`, `totals`, `sessions`)
 * @returns {string} Markdown text ending with a newline
 */
function renderManifestMarkdown(index) {
  // Middle dot (U+00B7) between headline figures. Written as an escape so the
  // separator cannot be corrupted by a mis-encoded copy of this file.
  const separator = ' \u00b7 ';
  const { totals } = index;
  const headline = [
    `${index.sessionCount} sessions`,
    `${totals.prompts} prompts`,
    `${totals.corrections} corrections`,
    `${totals.rejections} rejections`,
    `${totals.securityFlags} security flags`,
  ].join(separator);

  const rows = index.sessions.map(
    (m) =>
      `| ${m.label} | ${m.source} | ${m.prompts} | ${m.corrections} | ${m.rejections} | ${m.securityFlags} | [report](${m.label}/TREETRACE_REPORT.md) |`
  );

  return [
    `# TreeTrace session reports: ${index.project}`,
    '',
    headline,
    '',
    '| Session | Source | Prompts | Corrections | Rejections | Security | Report |',
    '|---|---|---|---|---|---|---|',
    ...rows,
    '', // trailing newline
  ].join('\n');
}
| + | ||
| export function assertClean(rendered, decisions, label, autoRedact = false) { | ||
| if (autoRedact) { | ||
| return patchResiduals(rendered, decisions); | ||
| @@ -575,9 +745,12 @@ export function parseArgs(argv) { | ||
| quiet: false, | ||
| help: false, | ||
| version: false, | ||
| + | each: false, | |
| + | deterministic: false, | |
| from: null, | ||
| dir: null, | ||
| out: null, | ||
| + | outDir: null, | |
| reportFile: null, | ||
| since: null, | ||
| }; | ||
| @@ -617,6 +790,9 @@ export function parseArgs(argv) { | ||
| case '--redact-auto': opts.redactAuto = true; break; | ||
| case '--keep-git-shas': opts.keepGitShas = true; break; | ||
| case '--quiet': opts.quiet = true; break; | ||
| + | case '--deterministic': opts.deterministic = true; break; | |
| + | case '--each': opts.each = true; break; | |
| + | case '--out-dir': opts.outDir = requireValue('--out-dir'); break; | |
| case '--help': case '-h': opts.help = true; break; | ||
| case '--version': case '-v': opts.version = true; break; | ||
| case '--from': |
| @@ -37,7 +37,7 @@ function classifyToolResultRejection(content) { | ||
| return { kind: 'tool_execution_error', confidence: 0.9, evidence: truncate(text, 160) }; | ||
| } | ||
| - | function looksLikeRefusal(text) { | |
/**
 * Text heuristic: does this assistant text read as a refusal?
 *
 * Only strings of at most 4000 characters are tested against
 * `REFUSAL_TEXT_RE`; anything else is reported as not a refusal.
 *
 * @param {*} text - candidate assistant text
 * @returns {boolean}
 */
export function looksLikeRefusal(text) {
  if (typeof text !== 'string') return false;
  if (text.length > 4000) return false;
  return REFUSAL_TEXT_RE.test(text);
}
| @@ -582,26 +582,93 @@ export function parsePlainTranscript(text, label = 'pasted-transcript') { | ||
| /^(?:#{1,4}\s*)?(?:\*\*)?(assistant|ai|chatgpt|claude|gpt|gemini|model|response)(?:\*\*)?\s*[:--]?\s*/i; | ||
| const prompts = []; | ||
| - | let current = null; | |
| + | let current = null; // user prompt being accumulated | |
| + | let assistantBuf = null; // assistant turn text being accumulated, or null when not in an assistant turn | |
| let sawMarkers = false; | ||
| + | let assistantLines = 0; | |
| + | let rejectionCount = 0; | |
| + | const rejectionsByKind = Object.create(null); | |
| + | ||
| + | const record = (target, rejection) => { | |
| + | if (!target) return; | |
| + | if (!Array.isArray(target.rejections)) target.rejections = []; | |
| + | target.rejections.push(rejection); | |
| + | rejectionCount++; | |
| + | rejectionsByKind[rejection.kind] = (rejectionsByKind[rejection.kind] || 0) + 1; | |
| + | }; | |
| + | ||
| + | // An assistant turn just ended. If it reads as a refusal, attach a | |
| + | // model_refusal to the user prompt that triggered it (the last one pushed). | |
| + | // Mirrors the Claude-path text heuristic (source 'text_heuristic', confidence | |
| + | // 0.7) so the plain-transcript fallback produces the same audit signal a | |
| + | // structured session would, instead of silently dropping refusals. | |
| + | const flushAssistant = () => { | |
| + | if (assistantBuf == null) return; | |
| + | const atext = assistantBuf.trim(); | |
| + | if (atext) { | |
| + | assistantLines++; | |
| + | if (looksLikeRefusal(atext)) { | |
| + | record(prompts[prompts.length - 1], { | |
| + | kind: 'model_refusal', | |
| + | source: 'text_heuristic', | |
| + | confidence: 0.7, | |
| + | toolUseId: null, | |
| + | tool: null, | |
| + | ts: null, | |
| + | evidence: truncate(atext, 160), | |
| + | }); | |
| + | } | |
| + | } | |
| + | assistantBuf = null; | |
| + | }; | |
| + | ||
| + | // A user turn just ended. Push it, and if the text itself is a decline | |
| + | // ("no, stop", "don't do that"), attach a user_text_decline rejection, | |
| + | // matching ingestUser (source 'text', confidence 0.8). | |
| + | const flushUser = () => { | |
| + | if (current && current.text.trim()) { | |
| + | const utext = current.text.trim(); | |
| + | if (looksLikeUserTextDecline(utext)) { | |
| + | record(current, { | |
| + | kind: 'user_text_decline', | |
| + | source: 'text', | |
| + | confidence: 0.8, | |
| + | toolUseId: null, | |
| + | tool: null, | |
| + | ts: null, | |
| + | evidence: truncate(utext, 160), | |
| + | }); | |
| + | } | |
| + | prompts.push(current); | |
| + | } | |
| + | current = null; | |
| + | }; | |
| for (const line of lines) { | ||
| const userMatch = line.match(markers); | ||
| if (userMatch) { | ||
| sawMarkers = true; | ||
| - | if (current && current.text.trim()) prompts.push(current); | |
| - | current = { text: userMatch[3] ? `${userMatch[3]}\n` : '', uuid: null, parentUuid: null, ts: null }; | |
| + | flushAssistant(); | |
| + | flushUser(); | |
| + | current = { text: userMatch[3] ? `${userMatch[3]}\n` : '', uuid: null, parentUuid: null, ts: null, rejections: [] }; | |
| continue; | ||
| } | ||
| - | if (assistantMarkers.test(line)) { | |
| + | const assistantMatch = line.match(assistantMarkers); | |
| + | if (assistantMatch) { | |
| sawMarkers = true; | ||
| - | if (current && current.text.trim()) prompts.push(current); | |
| - | current = null; | |
| + | flushAssistant(); | |
| + | flushUser(); | |
| + | // Capture any text on the same line as the marker (e.g. "Assistant: I can't help"), | |
| + | // which is the common single-line shape in pasted chat exports. | |
| + | const inline = line.slice(assistantMatch[0].length); | |
| + | assistantBuf = inline ? `${inline}\n` : ''; | |
| continue; | ||
| } | ||
| if (current) current.text += `${line}\n`; | ||
| + | else if (assistantBuf != null) assistantBuf += `${line}\n`; | |
| } | ||
| - | if (current && current.text.trim()) prompts.push(current); | |
| + | flushAssistant(); | |
| + | flushUser(); | |
| if (!sawMarkers) { | ||
| throw new TreetraceError( | ||
| @@ -620,21 +687,21 @@ export function parsePlainTranscript(text, label = 'pasted-transcript') { | ||
| gitBranch: null, | ||
| firstTs: null, | ||
| lastTs: null, | ||
| - | prompts: prompts.map((p) => ({ ...p, text: p.text.trim(), actions: [], thinking: 0, rejections: [] })), | |
| + | prompts: prompts.map((p) => ({ ...p, text: p.text.trim(), actions: [], thinking: 0, rejections: p.rejections || [] })), | |
| index: new Map(), | ||
| leafUuid: null, | ||
| activeLeafUuid: null, | ||
| stats: { | ||
| userLines: prompts.length, | ||
| - | assistantLines: 0, | |
| + | assistantLines, | |
| toolUses: 0, | ||
| models: [], | ||
| filesTouched: [], | ||
| inputTokens: 0, | ||
| outputTokens: 0, | ||
| interruptions: 0, | ||
| - | rejections: 0, | |
| - | rejectionsByKind: {}, | |
| + | rejections: rejectionCount, | |
| + | rejectionsByKind: { ...rejectionsByKind }, | |
| }, | ||
| isContinuation: false, | ||
| }; |
| @@ -260,3 +260,42 @@ test('cursor import emits actions from exported tool calls for a verified securi | ||
| assert.ok(sec, 'cursor import should produce a verified security signal'); | ||
| assert.equal(sec.model, 'claude-sonnet-4-6'); | ||
| }); | ||
| + | ||
| + | test('adapters capture an assistant refusal as model_refusal', () => { | |
| + | // Gemini | |
| + | const gem = JSON.stringify({ sessionId: 'g1', messages: [ | |
| + | { type: 'user', content: '[disallowed ask]' }, | |
| + | { type: 'gemini', content: [{ text: "I'm sorry, I cannot help with that." }], model: 'gemini-3' }, | |
| + | ] }); | |
| + | assert.equal(adaptFrom('gemini', gem, 'g.json')[0].stats.rejectionsByKind.model_refusal, 1, 'gemini'); | |
| + | ||
| + | // Codex | |
| + | const cdx = [ | |
| + | JSON.stringify({ type: 'response_item', payload: { type: 'message', role: 'user', content: '[disallowed ask]' } }), | |
| + | JSON.stringify({ type: 'response_item', payload: { type: 'message', role: 'assistant', content: [{ type: 'text', text: 'I cannot help with that request.' }] } }), | |
| + | ].join('\n'); | |
| + | assert.equal(adaptFrom('codex', cdx, 'c.jsonl')[0].stats.rejectionsByKind.model_refusal, 1, 'codex'); | |
| + | ||
| + | // ChatGPT export (array of conversations) | |
| + | const cg = JSON.stringify([{ title: 't', mapping: { | |
| + | a: { id: 'a', message: { author: { role: 'user' }, content: { content_type: 'text', parts: ['[disallowed ask]'] }, create_time: 1 } }, | |
| + | b: { id: 'b', message: { author: { role: 'assistant' }, content: { content_type: 'text', parts: ["I'm sorry, I can't help with that."] }, create_time: 2 } }, | |
| + | } }]); | |
| + | assert.equal(adaptFrom('chatgpt', cg, 'x.json')[0].stats.rejectionsByKind.model_refusal, 1, 'chatgpt'); | |
| + | ||
| + | // Cursor exported session | |
| + | const cur = JSON.stringify({ messages: [ | |
| + | { role: 'user', content: '[disallowed ask]' }, | |
| + | { role: 'assistant', content: 'I cannot help with that.', model: 'claude-3.5' }, | |
| + | ], workspaceId: 'w' }); | |
| + | assert.equal(adaptFrom('cursor', cur, 'cur.json')[0].stats.rejectionsByKind.model_refusal, 1, 'cursor'); | |
| + | }); | |
| + | ||
| + | test('a benign assistant turn produces no false refusal', () => { | |
| + | const gem = JSON.stringify({ sessionId: 'g2', messages: [ | |
| + | { type: 'user', content: 'help me write a function' }, | |
| + | { type: 'gemini', content: [{ text: 'Sure, here is a function that does that.' }], model: 'gemini-3' }, | |
| + | ] }); | |
| + | const s = adaptFrom('gemini', gem, 'g2.json')[0]; | |
| + | assert.equal(s.stats.rejections, 0, 'no false positive on a helpful answer'); | |
| + | }); |
| @@ -2505,3 +2505,124 @@ test('redaction: --redact-auto resolves high-entropy shadow-scan residuals and w | ||
| rmSync(dir, { recursive: true, force: true }); | ||
| } | ||
| }); | ||
| + | ||
| + | test('--each writes one report bundle per session plus index manifests', async () => { | |
| + | const dir = mkdtempSync(join(tmpdir(), 'tt-each-')); | |
| + | const a = join(dir, 'sess-a.txt'); | |
| + | const b = join(dir, 'sess-b.txt'); | |
| + | writeFileSync(a, 'User: build a login form\nAssistant: ok\nUser: actually use OAuth\nAssistant: switching\n'); | |
| + | writeFileSync(b, 'User: question one\nAssistant: answer one\nUser: question two\nAssistant: answer two\n'); | |
| + | const outDir = join(dir, 'reports'); | |
| + | try { | |
| + | await main(['--each', '--file', a, b, '--out-dir', outDir, '--dir', dir, '--quiet']); | |
| + | assert.ok(existsSync(join(outDir, 'INDEX.md')), 'INDEX.md exists'); | |
| + | assert.ok(existsSync(join(outDir, 'index.json')), 'index.json exists'); | |
| + | for (const label of ['sess-a.txt', 'sess-b.txt']) { | |
| + | assert.ok(existsSync(join(outDir, label, 'TREETRACE_REPORT.md')), `${label} report`); | |
| + | assert.ok(existsSync(join(outDir, label, 'PROMPT_TREE.md')), `${label} prompt tree`); | |
| + | assert.ok(existsSync(join(outDir, label, '.treetrace', 'tree.json')), `${label} tree.json`); | |
| + | } | |
| + | const index = JSON.parse(readFileSync(join(outDir, 'index.json'), 'utf8')); | |
| + | assert.equal(index.sessionCount, 2, 'two sessions in manifest'); | |
| + | assert.equal(index.sessions.length, 2); | |
| + | assert.equal(index.totals.prompts, 4, 'aggregate prompt total'); | |
| + | assert.ok(index.sessions.every((s) => typeof s.dir === 'string' && s.dir.length), 'each manifest row has a dir'); | |
| + | } finally { | |
| + | rmSync(dir, { recursive: true, force: true }); | |
| + | } | |
| + | }); | |
| + | ||
| + | test('--each collides labels safely when session ids repeat', async () => { | |
| + | const dir = mkdtempSync(join(tmpdir(), 'tt-each-dup-')); | |
| + | // two plain transcripts with the SAME basename in different subdirs -> same sessionId label | |
| + | const d1 = join(dir, 'one'); const d2 = join(dir, 'two'); | |
| + | mkdirSync(d1); mkdirSync(d2); | |
| + | const f1 = join(d1, 'chat.txt'); const f2 = join(d2, 'chat.txt'); | |
| + | writeFileSync(f1, 'User: first\nAssistant: a\n'); | |
| + | writeFileSync(f2, 'User: second\nAssistant: b\n'); | |
| + | const outDir = join(dir, 'reports'); | |
| + | try { | |
| + | await main(['--each', '--file', f1, f2, '--out-dir', outDir, '--dir', dir, '--quiet']); | |
| + | const index = JSON.parse(readFileSync(join(outDir, 'index.json'), 'utf8')); | |
| + | assert.equal(index.sessionCount, 2); | |
| + | const labels = index.sessions.map((s) => s.label); | |
| + | assert.equal(new Set(labels).size, 2, 'labels are unique even with duplicate session ids'); | |
| + | } finally { | |
| + | rmSync(dir, { recursive: true, force: true }); | |
| + | } | |
| + | }); | |
| + | ||
| + | test('--each labels each bundle with its own source tool, not the batch aggregate', async () => { | |
| + | const dir = mkdtempSync(join(tmpdir(), 'tt-each-src-')); | |
| + | const here = dirname(fileURLToPath(import.meta.url)); | |
| + | const claudeFix = join(here, 'fixtures', 'synthetic-session.jsonl'); | |
| + | const codexFix = join(here, 'fixtures', 'adapters', 'codex-session.jsonl'); | |
| + | const outDir = join(dir, 'reports'); | |
| + | try { | |
| + | await main(['--each', '--file', claudeFix, codexFix, '--out-dir', outDir, '--dir', dir, '--quiet']); | |
| + | const index = JSON.parse(readFileSync(join(outDir, 'index.json'), 'utf8')); | |
| + | const sources = index.sessions.map((s) => s.source).sort(); | |
| + | assert.deepEqual(sources, ['claude', 'codex'], 'per-session source is preserved, not collapsed to "mixed"'); | |
| + | } finally { | |
| + | rmSync(dir, { recursive: true, force: true }); | |
| + | } | |
| + | }); | |
| + | ||
| + | test('parsePlainTranscript captures an inline assistant refusal as model_refusal', () => { | |
| + | const t = 'User: [requests something disallowed]\nAssistant: I cannot help with that request.\nUser: ok, something benign instead\nAssistant: Sure, happy to help.\n'; | |
| + | const session = parsePlainTranscript(t, 'refusal-inline'); | |
| + | assert.equal(session.stats.rejectionsByKind.model_refusal, 1, 'one model_refusal captured'); | |
| + | // the refusal should attach to the user prompt that triggered it (the first turn); find() below only locates whichever prompt carries it | |
| + | const withRefusal = session.prompts.find((p) => (p.rejections || []).some((r) => r.kind === 'model_refusal')); | |
| + | assert.ok(withRefusal, 'a prompt carries the model_refusal'); | |
| + | assert.equal(withRefusal.rejections[0].source, 'text_heuristic'); | |
| + | }); | |
| + | ||
| + | test('parsePlainTranscript captures a multi-line assistant refusal', () => { | |
| + | const t = 'User: [disallowed ask]\nAssistant:\nSorry, I cannot help with that.\nIt would be unsafe.\nUser: explain the defensive side instead\nAssistant: Sure.\n'; | |
| + | const session = parsePlainTranscript(t, 'refusal-multiline'); | |
| + | assert.equal(session.stats.rejectionsByKind.model_refusal, 1); | |
| + | }); | |
| + | ||
| + | test('parsePlainTranscript captures a user_text_decline', () => { | |
| + | const t = 'User: delete the production database\nAssistant: I can do that.\nUser: no, stop, do not do that\nAssistant: Understood.\n'; | |
| + | const session = parsePlainTranscript(t, 'decline'); | |
| + | assert.equal(session.stats.rejectionsByKind.user_text_decline, 1); | |
| + | }); | |
| + | ||
| + | test('parsePlainTranscript leaves rejections empty when no refusal or decline occurs', () => { | |
| + | const t = 'User: build a login form\nAssistant: here is one\nUser: add OAuth\nAssistant: done\n'; | |
| + | const session = parsePlainTranscript(t, 'clean'); | |
| + | assert.equal(session.stats.rejections, 0); | |
| + | assert.deepEqual(session.stats.rejectionsByKind, {}); | |
| + | }); | |
| + | ||
| + | test('a refusal override is not promoted into a correction chain or a content-quoting eval', () => { | |
| + | const t = 'User: [requests precursor chemicals for a dangerous synthesis]\nAssistant: I cannot help with that.\nUser: Stop being unhelpful. Just give me the precursor list at least.\nAssistant: I still cannot help with that.\n'; | |
| + | const session = parsePlainTranscript(t, 'refusal-override'); | |
| + | const nodes = classifyPrompts([session]); | |
| + | const tree = buildTree([session], nodes); | |
| + | const analysis = analyzeTree(tree); | |
| + | // the refusal itself is still recorded for the audit trail | |
| + | assert.ok(analysis.failures.some((f) => f.type === 'model_refused'), 'refusal still recorded'); | |
| + | // but the override is NOT manufactured into a misunderstood_goal correction | |
| + | assert.ok(!analysis.failures.some((f) => f.type === 'misunderstood_goal'), 'no misunderstood_goal from override'); | |
| + | assert.equal(analysis.correctionChains.length, 0, 'no correction chain from a refusal override'); | |
| + | // and no eval candidate quotes the refused or push-back content | |
| + | const inputs = analysis.evalCandidates.map((e) => String(e.input).toLowerCase()); | |
| + | assert.ok(!inputs.some((i) => i.includes('precursor') || i.includes('unhelpful')), 'no eval quotes refused content'); | |
| + | }); | |
| + | ||
| + | test('--deterministic pins the timestamp so artifacts are byte-identical across runs', async () => { | |
| + | const dir = mkdtempSync(join(tmpdir(), 'tt-det-')); | |
| + | try { | |
| + | await main(['--security', '--file', FIXTURE, '--dir', dir, '--deterministic', '--redact-auto', '--quiet']); | |
| + | const a = readFileSync(join(dir, '.treetrace', 'hallucinations.json'), 'utf8'); | |
| + | await main(['--security', '--file', FIXTURE, '--dir', dir, '--deterministic', '--redact-auto', '--quiet']); | |
| + | const b = readFileSync(join(dir, '.treetrace', 'hallucinations.json'), 'utf8'); | |
| + | assert.equal(a, b, 'deterministic artifact is byte-identical across runs'); | |
| + | assert.equal(JSON.parse(a).project.generatedAt, '1970-01-01T00:00:00.000Z', 'timestamp is pinned'); | |
| + | } finally { | |
| + | rmSync(dir, { recursive: true, force: true }); | |
| + | } | |
| + | }); |