test: evals audit datasets to dev-facing cases (#16433)

denolfe · web-flow · commit f221f6c22baf · 2026-04-30T11:00:10.000-04:00
# Overview

Trims `test/evals/datasets/` so the eval suites measure knowledge a
developer applies while building an application with Payload, not
knowledge a Payload-monorepo contributor needs. Also adds shorthand npm
scripts for running individual eval suites.

## Key Changes

- **Trimmed `conventions/qa.ts` from 10 cases to 1**
- Dropped 9 cases lifted from `CLAUDE.md` (types vs interfaces, boolean
naming, function vs class, translation paths, `afterEach` cleanup,
conventional-commits, dev-server flags, auto-login creds,
single-object-parameter convention).
- Kept the `payload.logger.error` shape case, the only one that
describes a call shape a Payload consumer writes in their own code.

- **Removed `plugins/official/qa.ts`**
- 11 reference-doc QA cases ("what does plugin X do") testing recall,
not application. The borderline MCP-config case is already covered by
`plugins/official/codegen.ts` via real code generation.
- `eval.official-plugins.spec.ts` updated to drop the QA registration;
codegen registration unchanged.

- **Corrected the audience map in `EvalDashboard/audience.ts`**
- `negative` retagged from `maintainers` to `users`. Six of seven
retained `negative` cases are dev-facing (debugging your own broken
config); the map can't split sub-arrays, so `users` is the better
representative tag.
- Removed three category keys (`commits`, `structure`, `testing`) that
no longer appear in any dataset after the conventions trim.

- **Added `test:eval:<suite>` shorthand scripts**
- One per suite (`building-plugins`, `collections`, `config`,
`conventions`, `fields`, `graphql`, `local-api`, `negative`,
`official-plugins`, `rest-api`). Each delegates to the `:skill` variant,
matching the project-wide default.

## Design Decisions

The dividing line is "would a developer consuming `payload` from npm
encounter this?" If no, the case is contributor-only and removed.

Three pre-existing categories were intentionally kept in scope but
untouched:

- `negative/codegen.ts` `negativeInvalidInstructionDataset` is an
eval-pipeline self-test (it verifies `tsc` rejects bad types) and is
preserved as-is.
- `plugins/qa.ts` and `plugins/codegen.ts` stay because developers may
colocate plugins inside their own project structure.
- Other dead audience-map keys (`'access-control'`, `admin`,
`'building-plugins'`, `conventions`, `hooks`, `'official-plugins'`,
`translations`) were dead before this audit and were left to keep the
diff focused.

`conventions/qa.ts` and `eval.conventions.spec.ts` are kept rather than
deleted so the surviving `coding`-category case still runs as a
registered suite.
diff --git a/package.json b/package.json
@@ -122,35 +122,45 @@
     "test:e2e:prod:run:noturbo": "pnpm runts ./test/runE2E.ts --prod --no-turbo",
     "test:eval": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 vitest --run --project eval",
     "test:eval:baseline": "cross-env EVAL_VARIANT=baseline NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 vitest --run --project eval",
+    "test:eval:building-plugins": "pnpm run test:eval:building-plugins:skill",
     "test:eval:building-plugins:baseline": "cross-env EVAL_VARIANT=baseline NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.building-plugins.spec",
     "test:eval:building-plugins:low-power": "cross-env EVAL_VARIANT=low-power NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.building-plugins.spec",
     "test:eval:building-plugins:skill": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.building-plugins.spec",
+    "test:eval:collections": "pnpm run test:eval:collections:skill",
     "test:eval:collections:baseline": "cross-env EVAL_VARIANT=baseline NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.collections.spec",
     "test:eval:collections:low-power": "cross-env EVAL_VARIANT=low-power NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.collections.spec",
     "test:eval:collections:skill": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.collections.spec",
+    "test:eval:config": "pnpm run test:eval:config:skill",
     "test:eval:config:baseline": "cross-env EVAL_VARIANT=baseline NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.config.spec",
     "test:eval:config:low-power": "cross-env EVAL_VARIANT=low-power NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.config.spec",
     "test:eval:config:skill": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.config.spec",
+    "test:eval:conventions": "pnpm run test:eval:conventions:skill",
     "test:eval:conventions:baseline": "cross-env EVAL_VARIANT=baseline NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.conventions.spec",
     "test:eval:conventions:low-power": "cross-env EVAL_VARIANT=low-power NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.conventions.spec",
     "test:eval:conventions:skill": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.conventions.spec",
+    "test:eval:fields": "pnpm run test:eval:fields:skill",
     "test:eval:fields:baseline": "cross-env EVAL_VARIANT=baseline NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.fields.spec",
     "test:eval:fields:low-power": "cross-env EVAL_VARIANT=low-power NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.fields.spec",
     "test:eval:fields:skill": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.fields.spec",
+    "test:eval:graphql": "pnpm run test:eval:graphql:skill",
     "test:eval:graphql:baseline": "cross-env EVAL_VARIANT=baseline NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.graphql.spec",
     "test:eval:graphql:low-power": "cross-env EVAL_VARIANT=low-power NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.graphql.spec",
     "test:eval:graphql:skill": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.graphql.spec",
+    "test:eval:local-api": "pnpm run test:eval:local-api:skill",
     "test:eval:local-api:baseline": "cross-env EVAL_VARIANT=baseline NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.local-api.spec",
     "test:eval:local-api:low-power": "cross-env EVAL_VARIANT=low-power NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.local-api.spec",
     "test:eval:local-api:skill": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.local-api.spec",
     "test:eval:low-power": "cross-env EVAL_VARIANT=low-power NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 vitest --run --project eval",
+    "test:eval:negative": "pnpm run test:eval:negative:skill",
     "test:eval:negative:baseline": "cross-env EVAL_VARIANT=baseline NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.negative.spec",
     "test:eval:negative:low-power": "cross-env EVAL_VARIANT=low-power NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.negative.spec",
     "test:eval:negative:skill": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.negative.spec",
+    "test:eval:official-plugins": "pnpm run test:eval:official-plugins:skill",
     "test:eval:official-plugins:baseline": "cross-env EVAL_VARIANT=baseline NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.official-plugins.spec",
     "test:eval:official-plugins:low-power": "cross-env EVAL_VARIANT=low-power NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.official-plugins.spec",
     "test:eval:official-plugins:skill": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.official-plugins.spec",
     "test:eval:report": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 vitest --run --project eval --reporter=default --reporter=html --outputFile.html=test/evals/eval-results/report.html",
+    "test:eval:rest-api": "pnpm run test:eval:rest-api:skill",
     "test:eval:rest-api:baseline": "cross-env EVAL_VARIANT=baseline NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.rest-api.spec",
     "test:eval:rest-api:low-power": "cross-env EVAL_VARIANT=low-power NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.rest-api.spec",
     "test:eval:rest-api:skill": "cross-env NODE_OPTIONS=\"--no-deprecation --no-experimental-strip-types\" NODE_NO_WARNINGS=1 pnpm exec vitest --run --project eval eval.rest-api.spec",
diff --git a/test/evals/components/EvalDashboard/audience.ts b/test/evals/components/EvalDashboard/audience.ts
@@ -13,19 +13,16 @@ const CATEGORY_AUDIENCES: Record<string, Audience[]> = {
   admin: ['admins', 'users'],
   'building-plugins': ['maintainers', 'users'],
   collections: ['users'],
-  commits: ['maintainers'],
   config: ['users'],
   conventions: ['maintainers'],
   fields: ['users'],
   graphql: ['users'],
   hooks: ['users'],
   'local-api': ['users'],
-  negative: ['maintainers'],
+  negative: ['users'],
   'official-plugins': ['users'],
   plugins: ['users'],
   'rest-api': ['users'],
-  structure: ['maintainers'],
-  testing: ['maintainers'],
   translations: ['maintainers'],
 }
 
diff --git a/test/evals/datasets/conventions/qa.ts b/test/evals/datasets/conventions/qa.ts
@@ -3,61 +3,10 @@ import type { EvalCase } from '../../types.js'
 export type { EvalCase }
 
 export const conventionsQADataset: EvalCase[] = [
-  {
-    input:
-      'In Payload, should you prefer types or interfaces when defining TypeScript data shapes?',
-    expected: 'types should be preferred over interfaces, except when extending external types',
-    category: 'coding',
-  },
-  {
-    input: 'What naming convention should be used for boolean variables in Payload code?',
-    expected:
-      'booleans should be prefixed with is, has, can, or should — for example isValid, hasData, canEdit, shouldRun',
-    category: 'coding',
-  },
-  {
-    input: 'Should Payload code prefer functions or classes?',
-    expected: 'functions are preferred over classes; classes are only used for errors and adapters',
-    category: 'coding',
-  },
   {
     input: 'When passing an error to payload.logger.error, what is the correct format?',
     expected:
       'use an object with msg and err keys, like payload.logger.error({ msg: "message", err: error }); do not pass the error as a second argument',
     category: 'coding',
   },
-  {
-    input: 'Where do translation files live in the Payload monorepo?',
-    expected: 'packages/translations/src/languages/',
-    category: 'structure',
-  },
-  {
-    input: 'What is the pattern for cleaning up database records created during a Payload test?',
-    expected:
-      'tests must delete any records they create; use afterEach with a shared array of created IDs to centralize cleanup, then clear the array',
-    category: 'testing',
-  },
-  {
-    input: 'What format should the first commit on a new Payload branch follow? Give an example.',
-    expected:
-      'conventional commits format: <type>(<scope>): <lowercase title> — for example feat(db-mongodb): add support for transactions or fix(ui): json field type ignoring editorOptions',
-    category: 'commits',
-  },
-  {
-    input: 'How do you start the Payload dev server using a specific test config directory?',
-    expected:
-      'run pnpm run dev <directory_name>, for example pnpm run dev fields loads test/fields/config.ts',
-    category: 'development',
-  },
-  {
-    input: 'What are the default auto-login credentials when running the Payload dev server?',
-    expected: 'email dev@payloadcms.com and password test',
-    category: 'development',
-  },
-  {
-    input:
-      'In Payload functions, should parameters be passed as individual arguments or as a single object?',
-    expected: 'prefer single object parameters to improve backwards-compatibility',
-    category: 'coding',
-  },
 ]
diff --git a/test/evals/datasets/plugins/official/qa.ts b/test/evals/datasets/plugins/official/qa.ts
diff --git a/test/evals/eval.official-plugins.spec.ts b/test/evals/eval.official-plugins.spec.ts
@@ -1,14 +1,12 @@
 import { describe } from 'vitest'
 
 import { pluginsOfficialCodegenDataset } from './datasets/plugins/official/codegen.js'
-import { pluginsOfficialQADataset } from './datasets/plugins/official/qa.js'
-import { registerCodegenCases, registerQACases } from './suites/helpers.js'
+import { registerCodegenCases } from './suites/helpers.js'
 import { resolveVariantOptions } from './variantOptions.js'
 
 const options = resolveVariantOptions()
 const { labelSuffix = '' } = options
 
 describe(`Official Plugins${labelSuffix}`, () => {
-  registerQACases(pluginsOfficialQADataset, 'Official Plugins: QA', options)
   registerCodegenCases(pluginsOfficialCodegenDataset, 'Official Plugins: Codegen', options)
 })