diff --git a/reviewer-calibration-assistant/demo/generate-demo.js b/reviewer-calibration-assistant/demo/generate-demo.js new file mode 100644 index 00000000..2dd5116c --- /dev/null +++ b/reviewer-calibration-assistant/demo/generate-demo.js @@ -0,0 +1,50 @@ +import { mkdirSync, writeFileSync } from "node:fs"; +import { evaluateReviewerCalibration, renderMarkdownReport } from "../src/calibration.js"; + +const packet = { + projectId: "proj-neuro-118", + manuscriptId: "ms-gapfinder-204", + domainTemplate: "neuroscience", + reviewMode: "public", + reviews: [ + { + reviewerId: "rev-stat", + template: "neuroscience", + recommendation: "major_revision", + scores: { clarity: 3, rigor: 2, novelty: 4, reproducibility: 2 }, + claimChecks: [ + { claimId: "claim-primary-endpoint", evidenceIds: ["fig-2", "table-1"] }, + { claimId: "claim-causal-language", evidenceIds: ["methods-4"] }, + { claimId: "claim-replication", evidenceIds: [] } + ] + }, + { + reviewerId: "rev-domain", + template: "neuroscience", + recommendation: "minor_revision", + scores: { clarity: 4, rigor: 4, novelty: 4, reproducibility: 3 }, + claimChecks: [ + { claimId: "claim-primary-endpoint", evidenceIds: ["fig-2"] }, + { claimId: "claim-causal-language", evidenceIds: [] }, + { claimId: "claim-replication", evidenceIds: ["supp-1"] } + ] + }, + { + reviewerId: "rev-methods", + template: "clinical_trials", + recommendation: "reject", + scores: { clarity: 2, rigor: 1, novelty: 3, reproducibility: 1 }, + claimChecks: [ + { claimId: "claim-primary-endpoint", evidenceIds: ["table-1"] }, + { claimId: "claim-causal-language", evidenceIds: [] }, + { claimId: "claim-replication", evidenceIds: [] } + ] + } + ] +}; + +mkdirSync("reports", { recursive: true }); +const result = evaluateReviewerCalibration(packet); +writeFileSync("reports/reviewer-calibration-report.json", `${JSON.stringify(result, null, 2)}\n`); +writeFileSync("reports/reviewer-calibration-report.md", renderMarkdownReport(result)); 
+console.log(JSON.stringify(result, null, 2)); diff --git a/reviewer-calibration-assistant/demo/write-svg.js b/reviewer-calibration-assistant/demo/write-svg.js new file mode 100644 index 00000000..8f2f04f1 --- /dev/null +++ b/reviewer-calibration-assistant/demo/write-svg.js @@ -0,0 +1,46 @@ +import { readFileSync, writeFileSync } from "node:fs"; + +const result = JSON.parse(readFileSync("reports/reviewer-calibration-report.json", "utf8")); +const blockerLines = result.blockers.slice(0, 5).map((blocker, index) => + `${blocker}` +).join("\n"); + +const svg = ` + + + Reviewer Calibration Assistant + AI-Powered Research Assistant Suite evidence artifact + + + Decision + ${result.decision} + + + Agreement + ${result.pairwiseAgreement} + + + Consensus + ${result.consensusAction} + + + Blockers + ${result.blockers.length} + + Calibration blockers + ${blockerLines} + Synthetic data only. No private manuscripts, external AI APIs, credentials, or payment systems. + + +`; + +writeFileSync("reports/reviewer-calibration-summary.svg", svg); diff --git a/reviewer-calibration-assistant/package.json b/reviewer-calibration-assistant/package.json new file mode 100644 index 00000000..29bcab62 --- /dev/null +++ b/reviewer-calibration-assistant/package.json @@ -0,0 +1,12 @@ +{ + "name": "reviewer-calibration-assistant", + "version": "1.0.0", + "description": "Deterministic reviewer calibration and inter-rater agreement assistant for SCIBASE AI review workflows.", + "type": "module", + "scripts": { + "test": "node test/calibration.test.js", + "demo": "node demo/generate-demo.js", + "demo:video": "npm run demo && node demo/write-svg.js && qlmanage -t -s 1280 -o reports reports/reviewer-calibration-summary.svg >/dev/null && ffmpeg -y -loop 1 -framerate 30 -t 8 -i reports/reviewer-calibration-summary.svg.png -vf format=yuv420p reports/reviewer-calibration-demo.mp4" + }, + "license": "MIT" +} diff --git a/reviewer-calibration-assistant/readme.md b/reviewer-calibration-assistant/readme.md new 
file mode 100644 index 00000000..951b21ee --- /dev/null +++ b/reviewer-calibration-assistant/readme.md @@ -0,0 +1,36 @@ +# Reviewer Calibration Assistant + +This module adds a deterministic reviewer calibration and inter-rater agreement assistant for the SCIBASE AI-Powered Research Assistant Suite. + +## Scope + +The assistant evaluates review packets before AI-generated peer-review summaries, reputation updates, or author-facing recommendations are released. It focuses on a distinct quality gate not covered by the existing broad assistant, reproducibility checker, research-gap finder, structured-abstract, model-assumption, dependency, external-validity, or evidence-binder slices. + +It checks: + +- pairwise reviewer agreement across clarity, rigor, novelty, reproducibility, and recommendation; +- recommendation consensus ratio; +- severe score spread by scoring dimension; +- domain-template drift; +- evidence-anchor coverage for reviewer claim checks; +- duplicate reviewer records; +- whether disputed packets should remain double-blind during calibration. + +## Local Validation + +```sh +npm test +npm run demo +npm run demo:video +``` + +The demo writes reviewer-ready evidence artifacts to `reports/`: + +- `reviewer-calibration-report.json` +- `reviewer-calibration-report.md` +- `reviewer-calibration-summary.svg` +- `reviewer-calibration-demo.mp4` + +## Boundaries + +The implementation is dependency-free and uses synthetic data only. It does not call external AI APIs, ingest private manuscripts, access credentials, or integrate with payment systems. 
diff --git a/reviewer-calibration-assistant/reports/reviewer-calibration-demo.mp4 b/reviewer-calibration-assistant/reports/reviewer-calibration-demo.mp4 new file mode 100644 index 00000000..21c9a222 Binary files /dev/null and b/reviewer-calibration-assistant/reports/reviewer-calibration-demo.mp4 differ diff --git a/reviewer-calibration-assistant/reports/reviewer-calibration-report.json b/reviewer-calibration-assistant/reports/reviewer-calibration-report.json new file mode 100644 index 00000000..f69dc99f --- /dev/null +++ b/reviewer-calibration-assistant/reports/reviewer-calibration-report.json @@ -0,0 +1,89 @@ +{ + "projectId": "proj-neuro-118", + "manuscriptId": "ms-gapfinder-204", + "reviewerCount": 3, + "reviewMode": "public", + "consensusAction": "major_revision", + "consensusRatio": 0.333, + "pairwiseAgreement": 0.667, + "pairwiseDetails": [ + { + "reviewers": [ + "rev-stat", + "rev-domain" + ], + "agreement": 0.75 + }, + { + "reviewers": [ + "rev-stat", + "rev-methods" + ], + "agreement": 0.75 + }, + { + "reviewers": [ + "rev-domain", + "rev-methods" + ], + "agreement": 0.5 + } + ], + "scoreDivergence": [ + { + "field": "clarity", + "spread": 2 + }, + { + "field": "rigor", + "spread": 3 + }, + { + "field": "reproducibility", + "spread": 2 + } + ], + "templateDrift": [ + { + "reviewerId": "rev-methods", + "expectedTemplate": "neuroscience", + "actualTemplate": "clinical_trials" + } + ], + "evidenceGaps": [ + { + "reviewerId": "rev-stat", + "coverage": 0.667 + }, + { + "reviewerId": "rev-domain", + "coverage": 0.667 + }, + { + "reviewerId": "rev-methods", + "coverage": 0.333 + } + ], + "blindModeRisk": { + "reviewMode": "public", + "reason": "disputed review packet should remain double-blind until calibration is complete" + }, + "duplicateReviewers": [], + "blockers": [ + "low_inter_rater_agreement", + "weak_recommendation_consensus", + "severe_score_divergence", + "domain_template_drift", + "insufficient_evidence_coverage", + "blind_mode_escalation_required" + 
], + "decision": "calibrate_before_release", + "nextActions": [ + "Run reviewer calibration on divergent scoring dimensions before author-facing release.", + "Request adjudication because recommendation consensus is below threshold.", + "Ask reviewers to justify high-spread score dimensions with evidence anchors.", + "Normalize all reviews to the configured domain template.", + "Require evidence anchors for unsupported claim checks.", + "Keep disputed packets double-blind until calibration is resolved." + ] +} diff --git a/reviewer-calibration-assistant/reports/reviewer-calibration-report.md b/reviewer-calibration-assistant/reports/reviewer-calibration-report.md new file mode 100644 index 00000000..e1eb1a3d --- /dev/null +++ b/reviewer-calibration-assistant/reports/reviewer-calibration-report.md @@ -0,0 +1,26 @@ +# Reviewer Calibration Assistant Report + +| Field | Value | +| --- | --- | +| Project | proj-neuro-118 | +| Manuscript | ms-gapfinder-204 | +| Reviewers | 3 | +| Decision | calibrate_before_release | +| Pairwise agreement | 0.667 | +| Consensus action | major_revision (0.333) | + +## Blockers +- low_inter_rater_agreement +- weak_recommendation_consensus +- severe_score_divergence +- domain_template_drift +- insufficient_evidence_coverage +- blind_mode_escalation_required + +## Required Actions +- Run reviewer calibration on divergent scoring dimensions before author-facing release. +- Request adjudication because recommendation consensus is below threshold. +- Ask reviewers to justify high-spread score dimensions with evidence anchors. +- Normalize all reviews to the configured domain template. +- Require evidence anchors for unsupported claim checks. +- Keep disputed packets double-blind until calibration is resolved. 
/**
 * Reviewer calibration gate.
 *
 * Evaluates a packet of peer reviews and decides whether the review summary
 * can be released or must go through calibration first. All checks are pure
 * functions of the packet and the thresholds; nothing here performs I/O.
 */

/** Default gate thresholds; callers may override any key per evaluation. */
const DEFAULT_THRESHOLDS = Object.freeze({
  minimumPairwiseAgreement: 0.68,
  severeScoreSpread: 2,
  actionConsensusRatio: 0.67,
  requiredEvidenceCoverage: 0.75,
  requireBlindModeForDisputedReviews: true
});

/** Ordinal rank of each recommendation; unknown recommendations rank as 0. */
const ACTION_RANK = Object.freeze({
  accept: 4,
  minor_revision: 3,
  major_revision: 2,
  reject: 1
});

/** Score dimensions compared across reviewers. */
const SCORE_FIELDS = Object.freeze(["clarity", "rigor", "novelty", "reproducibility"]);

/**
 * Restricts a number to the inclusive range [min, max].
 * @param {number} value
 * @param {number} min
 * @param {number} max
 * @returns {number}
 */
function clamp(value, min, max) {
  return Math.max(min, Math.min(max, value));
}

/**
 * Coerces a raw score or rank to a finite number.
 *
 * Missing values already counted as 0; non-numeric values (for example "n/a")
 * used to become NaN, which made every downstream comparison false and let a
 * malformed packet pass the gate. They are now treated like a missing value.
 * @param {unknown} value
 * @returns {number} The numeric value, or 0 when it is missing or not finite.
 */
function toFiniteNumber(value) {
  const numeric = Number(value ?? 0);
  return Number.isFinite(numeric) ? numeric : 0;
}

/**
 * Finds the most frequent value.
 * Ties are broken by ascending string comparison so the result is deterministic.
 * @param {Array<unknown>} values Non-empty list of values.
 * @returns {[unknown, number]} The winning value and its occurrence count.
 */
function mode(values) {
  const counts = new Map();
  for (const value of values) counts.set(value, (counts.get(value) || 0) + 1);
  return [...counts.entries()].sort((a, b) => b[1] - a[1] || String(a[0]).localeCompare(String(b[0])))[0];
}

/**
 * Arithmetic mean.
 * @param {number[]} values
 * @returns {number} The mean, or 0 for an empty list.
 */
function average(values) {
  return values.length ? values.reduce((sum, value) => sum + value, 0) / values.length : 0;
}

/**
 * Difference between the highest and lowest score given for one dimension.
 * @param {object[]} reviews
 * @param {string} field One of SCORE_FIELDS.
 * @returns {number}
 */
function scoreSpread(reviews, field) {
  const scores = reviews.map((review) => toFiniteNumber(review.scores?.[field]));
  return Math.max(...scores) - Math.min(...scores);
}

/**
 * Flattens a review into a comparable vector: one entry per score dimension
 * followed by the rank of the recommendation.
 * @param {object} review
 * @returns {number[]}
 */
function reviewVector(review) {
  const scoreValues = SCORE_FIELDS.map((field) => toFiniteNumber(review.scores?.[field]));
  const actionValue = toFiniteNumber(ACTION_RANK[review.recommendation]);
  return [...scoreValues, actionValue];
}

/**
 * Agreement between two reviews as a value in [0, 1]; 1 means identical vectors.
 * @param {object} left
 * @param {object} right
 * @returns {number}
 */
function pairAgreement(left, right) {
  const leftVector = reviewVector(left);
  const rightVector = reviewVector(right);
  // Each component gap is scaled by 4 — presumably the widest gap expected on
  // the scoring scale; confirm against the review form. clamp() keeps the
  // result in range if a gap exceeds that.
  const distances = leftVector.map((value, index) => Math.abs(value - rightVector[index]) / 4);
  return clamp(1 - average(distances), 0, 1);
}

/**
 * Builds every unordered reviewer pair with its agreement rounded to 3 decimals.
 * @param {object[]} reviews
 * @returns {Array<{reviewers: [unknown, unknown], agreement: number}>}
 */
function buildPairs(reviews) {
  const pairs = [];
  for (let i = 0; i < reviews.length; i += 1) {
    for (let j = i + 1; j < reviews.length; j += 1) {
      pairs.push({
        reviewers: [reviews[i].reviewerId, reviews[j].reviewerId],
        agreement: Number(pairAgreement(reviews[i], reviews[j]).toFixed(3))
      });
    }
  }
  return pairs;
}

/**
 * Share of a review's claim checks that cite at least one evidence id.
 * @param {object} review
 * @returns {number} A ratio in [0, 1]; 0 when the review has no claim checks.
 */
function evidenceCoverage(review) {
  const claims = review.claimChecks || [];
  if (claims.length === 0) return 0;
  const supported = claims.filter((claim) => claim.evidenceIds?.length > 0).length;
  return supported / claims.length;
}

/**
 * Lists reviews whose template differs from the packet's domain template.
 * @param {object} packet
 * @returns {Array<{reviewerId: unknown, expectedTemplate: unknown, actualTemplate: string}>}
 */
function detectDomainTemplateDrift(packet) {
  const expected = packet.domainTemplate;
  return packet.reviews
    .filter((review) => review.template !== expected)
    .map((review) => ({
      reviewerId: review.reviewerId,
      expectedTemplate: expected,
      actualTemplate: review.template || "missing"
    }));
}

/**
 * Lists reviews whose evidence coverage is below the required threshold.
 * @param {object} packet
 * @param {object} thresholds
 * @returns {Array<{reviewerId: unknown, coverage: number}>}
 */
function detectEvidenceGaps(packet, thresholds) {
  return packet.reviews
    .map((review) => ({
      reviewerId: review.reviewerId,
      coverage: Number(evidenceCoverage(review).toFixed(3))
    }))
    .filter((review) => review.coverage < thresholds.requiredEvidenceCoverage);
}

/**
 * Lists score dimensions whose spread reaches the severe-spread threshold.
 * @param {object[]} reviews
 * @param {object} thresholds
 * @returns {Array<{field: string, spread: number}>}
 */
function detectScoreDivergence(reviews, thresholds) {
  return SCORE_FIELDS
    .map((field) => ({ field, spread: scoreSpread(reviews, field) }))
    .filter((item) => item.spread >= thresholds.severeScoreSpread);
}

/**
 * Flags a disputed packet that is not being handled double-blind.
 * @param {object} packet
 * @param {object} thresholds
 * @param {object[]} scoreDivergence Output of detectScoreDivergence.
 * @param {number} pairwiseAgreement Mean pairwise agreement.
 * @returns {{reviewMode: string, reason: string} | null} null when no escalation is needed.
 */
function detectBlindModeRisk(packet, thresholds, scoreDivergence, pairwiseAgreement) {
  if (!thresholds.requireBlindModeForDisputedReviews) return null;
  if (packet.reviewMode === "double_blind") return null;
  // A packet counts as disputed when any dimension diverges or agreement is low.
  if (scoreDivergence.length === 0 && pairwiseAgreement >= thresholds.minimumPairwiseAgreement) return null;
  return {
    reviewMode: packet.reviewMode || "missing",
    reason: "disputed review packet should remain double-blind until calibration is complete"
  };
}

/**
 * Merges caller overrides onto the defaults.
 *
 * Overrides that are explicitly undefined are ignored: spreading them would
 * replace a default with undefined, and comparisons against undefined are
 * always false, which silently disables that check.
 * @param {object | null | undefined} customThresholds
 * @returns {object}
 */
function resolveThresholds(customThresholds) {
  const overrides = Object.fromEntries(
    Object.entries(customThresholds ?? {}).filter(([, value]) => value !== undefined)
  );
  return { ...DEFAULT_THRESHOLDS, ...overrides };
}

/**
 * Returns reviewer ids that occur more than once, each listed once, in the
 * order their first repeat appears.
 * @param {object[]} reviews
 * @returns {unknown[]}
 */
function findDuplicateReviewerIds(reviews) {
  const seen = new Set();
  const duplicates = new Set();
  for (const review of reviews) {
    if (seen.has(review.reviewerId)) duplicates.add(review.reviewerId);
    else seen.add(review.reviewerId);
  }
  return [...duplicates];
}

/**
 * Evaluates a review packet against the calibration gate.
 *
 * @param {object} packet Review packet; `reviews` must hold at least two records.
 * @param {object} [customThresholds] Overrides for DEFAULT_THRESHOLDS keys.
 * @returns {object} Metrics, the triggered `blockers`, the `decision`
 *   ("calibrate_before_release" when any blocker fired, otherwise
 *   "release_review_summary") and the matching `nextActions`.
 * @throws {Error} When the packet has fewer than two review records.
 */
export function evaluateReviewerCalibration(packet, customThresholds = {}) {
  if (!packet || !Array.isArray(packet.reviews) || packet.reviews.length < 2) {
    throw new Error("Reviewer calibration requires at least two review records.");
  }
  const thresholds = resolveThresholds(customThresholds);

  const duplicateReviewers = findDuplicateReviewerIds(packet.reviews);
  const pairs = buildPairs(packet.reviews);
  // Averaged from the already-rounded pair values so the headline number
  // always matches what pairwiseDetails shows.
  const pairwiseAgreement = Number(average(pairs.map((pair) => pair.agreement)).toFixed(3));
  const [consensusAction, consensusCount] = mode(packet.reviews.map((review) => review.recommendation));
  const consensusRatio = Number((consensusCount / packet.reviews.length).toFixed(3));
  const scoreDivergence = detectScoreDivergence(packet.reviews, thresholds);
  const templateDrift = detectDomainTemplateDrift(packet);
  const evidenceGaps = detectEvidenceGaps(packet, thresholds);
  const blindModeRisk = detectBlindModeRisk(packet, thresholds, scoreDivergence, pairwiseAgreement);

  // Blocker order is fixed; nextActions is derived from it one-to-one.
  const blockers = [];
  if (duplicateReviewers.length) blockers.push("duplicate_reviewer_records");
  if (pairwiseAgreement < thresholds.minimumPairwiseAgreement) blockers.push("low_inter_rater_agreement");
  if (consensusRatio < thresholds.actionConsensusRatio) blockers.push("weak_recommendation_consensus");
  if (scoreDivergence.length) blockers.push("severe_score_divergence");
  if (templateDrift.length) blockers.push("domain_template_drift");
  if (evidenceGaps.length) blockers.push("insufficient_evidence_coverage");
  if (blindModeRisk) blockers.push("blind_mode_escalation_required");

  return {
    projectId: packet.projectId,
    manuscriptId: packet.manuscriptId,
    reviewerCount: packet.reviews.length,
    reviewMode: packet.reviewMode,
    consensusAction,
    consensusRatio,
    pairwiseAgreement,
    pairwiseDetails: pairs,
    scoreDivergence,
    templateDrift,
    evidenceGaps,
    blindModeRisk,
    duplicateReviewers,
    blockers,
    decision: blockers.length ? "calibrate_before_release" : "release_review_summary",
    nextActions: buildNextActions(blockers)
  };
}

/**
 * Maps each blocker code to its remediation sentence, preserving order.
 * @param {string[]} blockers
 * @returns {string[]}
 */
function buildNextActions(blockers) {
  const actionMap = {
    duplicate_reviewer_records: "Remove duplicate reviewer entries before reputation or review-summary updates.",
    low_inter_rater_agreement: "Run reviewer calibration on divergent scoring dimensions before author-facing release.",
    weak_recommendation_consensus: "Request adjudication because recommendation consensus is below threshold.",
    severe_score_divergence: "Ask reviewers to justify high-spread score dimensions with evidence anchors.",
    domain_template_drift: "Normalize all reviews to the configured domain template.",
    insufficient_evidence_coverage: "Require evidence anchors for unsupported claim checks.",
    blind_mode_escalation_required: "Keep disputed packets double-blind until calibration is resolved."
  };
  return blockers.map((blocker) => actionMap[blocker]);
}

/**
 * Makes a value safe to place inside a Markdown table cell: a raw "|" would
 * end the cell and a line break would end the row.
 * @param {unknown} value
 * @returns {string}
 */
function escapeTableCell(value) {
  return String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}

/**
 * Renders an evaluation result as a Markdown report.
 * @param {object} result Output of evaluateReviewerCalibration.
 * @returns {string} Markdown text ending with a newline.
 */
export function renderMarkdownReport(result) {
  const rows = [
    ["Project", result.projectId],
    ["Manuscript", result.manuscriptId],
    ["Reviewers", result.reviewerCount],
    ["Decision", result.decision],
    ["Pairwise agreement", result.pairwiseAgreement],
    ["Consensus action", `${result.consensusAction} (${result.consensusRatio})`]
  ];
  const summary = rows.map(([label, value]) => `| ${label} | ${escapeTableCell(value)} |`).join("\n");
  const blockers = result.blockers.length
    ? result.blockers.map((blocker) => `- ${blocker}`).join("\n")
    : "- none";
  const actions = result.nextActions.length
    ? result.nextActions.map((action) => `- ${action}`).join("\n")
    : "- Review packet can be released.";

  return [
    "# Reviewer Calibration Assistant Report",
    "",
    "| Field | Value |",
    "| --- | --- |",
    summary,
    "",
    "## Blockers",
    blockers,
    "",
    "## Required Actions",
    actions,
    ""
  ].join("\n");
}
[ + { claimId: "c1", evidenceIds: ["e1"] }, + { claimId: "c2", evidenceIds: ["e2"] } + ] + }, + { + reviewerId: "rev-c", + template: "clinical_trials", + recommendation: "major_revision", + scores: { clarity: 4, rigor: 3, novelty: 3, reproducibility: 3 }, + claimChecks: [ + { claimId: "c1", evidenceIds: ["e1"] }, + { claimId: "c2", evidenceIds: ["e2"] } + ] + } + ] +}; + +const riskyPacket = { + ...basePacket, + reviewMode: "public", + reviews: [ + basePacket.reviews[0], + { + reviewerId: "rev-b", + template: "molecular_biology", + recommendation: "accept", + scores: { clarity: 5, rigor: 5, novelty: 5, reproducibility: 5 }, + claimChecks: [ + { claimId: "c1", evidenceIds: [] }, + { claimId: "c2", evidenceIds: [] } + ] + }, + { + reviewerId: "rev-c", + template: "clinical_trials", + recommendation: "reject", + scores: { clarity: 1, rigor: 1, novelty: 2, reproducibility: 1 }, + claimChecks: [ + { claimId: "c1", evidenceIds: ["e1"] }, + { claimId: "c2", evidenceIds: [] } + ] + } + ] +}; + +const ready = evaluateReviewerCalibration(basePacket); +assert.equal(ready.decision, "release_review_summary"); +assert.equal(ready.blockers.length, 0); +assert.equal(ready.consensusAction, "major_revision"); +assert.ok(ready.pairwiseAgreement >= 0.8); + +const risky = evaluateReviewerCalibration(riskyPacket); +assert.equal(risky.decision, "calibrate_before_release"); +assert.ok(risky.blockers.includes("low_inter_rater_agreement")); +assert.ok(risky.blockers.includes("severe_score_divergence")); +assert.ok(risky.blockers.includes("domain_template_drift")); +assert.ok(risky.blockers.includes("insufficient_evidence_coverage")); +assert.ok(risky.blockers.includes("blind_mode_escalation_required")); +assert.equal(risky.templateDrift[0].reviewerId, "rev-b"); +assert.equal(risky.evidenceGaps.length, 2); + +assert.throws(() => evaluateReviewerCalibration({ reviews: [basePacket.reviews[0]] }), /at least two/); + +const report = renderMarkdownReport(risky); +assert.match(report, /Reviewer 
Calibration Assistant Report/); +assert.match(report, /calibrate_before_release/); +assert.match(report, /low_inter_rater_agreement/); + +console.log("reviewer calibration assistant tests passed");