diff --git a/src/commands/query.ts b/src/commands/query.ts
index d514c69..99cff2f 100644
--- a/src/commands/query.ts
+++ b/src/commands/query.ts
@@ -1,5 +1,6 @@
 import { Cli, z } from 'incur'
 import { createClient, requireApiKey } from '../lib/client'
+import { stripTrailingFormatClause } from '../lib/sql'
 
 export const query = Cli.create('query', {
   description: 'SQL analytics query commands',
@@ -8,7 +9,10 @@ export const query = Cli.create('query', {
 export function queryRunRun(sql: string) {
   requireApiKey()
   const client = createClient()
-  return client.post('/v0/query/', { query: sql })
+  // The API wraps the query in a paginating subquery with its own
+  // `FORMAT JSON`. ClickHouse forbids a `FORMAT` clause inside a subquery, so
+  // strip any trailing `FORMAT`/semicolon before sending to avoid a 400.
+  return client.post('/v0/query/', { query: stripTrailingFormatClause(sql) })
 }
 
 query.command('run', {
diff --git a/src/lib/sql.ts b/src/lib/sql.ts
new file mode 100644
--- /dev/null
+++ b/src/lib/sql.ts
@@ -0,0 +1,165 @@
+/**
+ * SQL helpers for the query command.
+ *
+ * The Formo query API wraps the SQL you submit so it can paginate the result
+ * set and force a machine-readable response:
+ *
+ *   SELECT * FROM (<query>) LIMIT <n> FORMAT JSON
+ *
+ * ClickHouse does not allow a `FORMAT` clause inside a subquery, so if your
+ * query ends in its own `FORMAT` clause the wrapped statement becomes:
+ *
+ *   SELECT * FROM (SELECT ... FORMAT CSV) LIMIT 100 FORMAT JSON
+ *
+ * which ClickHouse rejects with a 400. A trailing `FORMAT` (or a trailing
+ * semicolon) can never take effect through this endpoint anyway — the outer
+ * `FORMAT JSON` always wins, and output shaping is the CLI's `--format` job —
+ * so we remove it before sending and let the server wrap a clean query.
+ */
+
+/**
+ * Strip a trailing, top-level `FORMAT <name>` clause and any trailing
+ * semicolons from a SQL statement.
+ *
+ * The scan is aware of string literals, quoted identifiers, and comments, so
+ * `FORMAT`-looking text inside them is never mistaken for a real clause. The
+ * match is anchored to the end of the statement, so a `FORMAT` nested inside
+ * parentheses (a subquery) or part of an identifier/function such as
+ * `formatDateTime(...)` — or a column aliased `format` — is left untouched.
+ * A column named `format` followed by a sort direction, as in
+ * `ORDER BY format DESC`, is likewise not treated as a clause.
+ *
+ * Returns the original input unchanged when there is nothing to strip.
+ */
+export function stripTrailingFormatClause(sql: string): string {
+  if (!sql) return sql
+
+  const masked = maskLiterals(sql)
+  // A genuine trailing FORMAT clause: the keyword `format` preceded by a word
+  // boundary (so `formatDateTime` or a column aliased `format` is safe),
+  // followed by exactly one identifier (the format name) and nothing else.
+  // `ASC`/`DESC` are rejected as format names: they are sort directions, so
+  // `ORDER BY format DESC` (a column named `format`) must not be truncated
+  // to a dangling `ORDER BY`.
+  const formatClause =
+    /(^|[^A-Za-z0-9_])format\s+(?!(?:asc|desc)\b)[A-Za-z_][A-Za-z0-9_]*\s*$/i
+
+  let end = sql.length
+  let didStrip = false
+  // Peel top-level semicolons and a trailing FORMAT clause repeatedly so any
+  // ordering collapses to the bare query, e.g. `... FORMAT CSV;`,
+  // `...; FORMAT CSV`, or `... FORMAT CSV ;`. Trailing whitespace and comments
+  // are only ever skipped to *look* past them — never removed on their own.
+  for (;;) {
+    let e = end
+    while (e > 0 && /\s/.test(masked[e - 1])) e--
+    if (e === 0) break
+
+    if (masked[e - 1] === ';') {
+      end = e - 1
+      didStrip = true
+      continue
+    }
+
+    const match = formatClause.exec(masked.slice(0, e))
+    if (match) {
+      // Cut at the `format` keyword, after the leading word-boundary char.
+      end = match.index + match[1].length
+      didStrip = true
+      continue
+    }
+
+    break
+  }
+
+  if (!didStrip) return sql
+
+  // Tidy the real whitespace now left dangling where the clause used to be.
+  // Comments are not whitespace, so any genuine comment is preserved.
+  const stripped = sql.slice(0, end).replace(/\s+$/, '')
+  // Never manufacture an empty query from non-empty input (degenerate inputs
+  // such as a bare `FORMAT JSON`): let the original surface its own error.
+  return stripped === '' ? sql : stripped
+}
+
+/**
+ * Return a copy of `sql` with the *contents* of string literals, quoted
+ * identifiers, and comments replaced by spaces, preserving the original length
+ * so indices stay aligned with the source. Quote delimiters are kept; comments
+ * are blanked entirely. This lets the clause scanner reason about real SQL code
+ * without tripping over keywords that merely appear inside literals or comments.
+ */
+function maskLiterals(sql: string): string {
+  const out: string[] = []
+  const n = sql.length
+  let i = 0
+
+  while (i < n) {
+    const c = sql[i]
+    const next = i + 1 < n ? sql[i + 1] : ''
+
+    // Line comment: -- ... or # ... to end of line. ClickHouse accepts `#`
+    // and `#!` line comments for MySQL compatibility.
+    if ((c === '-' && next === '-') || c === '#') {
+      while (i < n && sql[i] !== '\n') {
+        out.push(' ')
+        i++
+      }
+      continue
+    }
+
+    // Block comment: /* ... */
+    if (c === '/' && next === '*') {
+      out.push(' ', ' ')
+      i += 2
+      while (i < n && !(sql[i] === '*' && sql[i + 1] === '/')) {
+        out.push(' ')
+        i++
+      }
+      if (i < n) {
+        out.push(' ', ' ')
+        i += 2
+      }
+      continue
+    }
+
+    // String literal or quoted identifier: '...', "...", `...`
+    if (c === "'" || c === '"' || c === '`') {
+      const quote = c
+      out.push(quote)
+      i++
+      while (i < n) {
+        const d = sql[i]
+        // Backslash escapes are honored inside single-quoted strings.
+        if (d === '\\' && quote === "'") {
+          out.push(' ')
+          i++
+          if (i < n) {
+            out.push(' ')
+            i++
+          }
+          continue
+        }
+        if (d === quote) {
+          // A doubled delimiter is an escaped quote, not the terminator.
+          if (i + 1 < n && sql[i + 1] === quote) {
+            out.push(' ', ' ')
+            i += 2
+            continue
+          }
+          out.push(quote)
+          i++
+          break
+        }
+        out.push(' ')
+        i++
+      }
+      continue
+    }
+
+    out.push(c)
+    i++
+  }
+
+  return out.join('')
+}
diff --git a/test/lib/sql.test.ts b/test/lib/sql.test.ts
new file mode 100644
--- /dev/null
+++ b/test/lib/sql.test.ts
@@ -0,0 +1,180 @@
+import { expect } from 'chai';
+import { stripTrailingFormatClause } from '../../src/lib/sql';
+
+/**
+ * Mirror what the API does to a submitted query: wrap it in a paginating
+ * subquery with a single outer FORMAT JSON. Used to assert the stripped query
+ * produces a statement with no FORMAT inside the parentheses.
+ */
+function wrap(inner: string): string {
+  return `SELECT * FROM (${inner}) LIMIT 100 FORMAT JSON`;
+}
+
+/** Count real `FORMAT <name>` clauses (ignores formatDateTime, etc.). */
+function countFormatClauses(sql: string): number {
+  const matches = sql.match(/\bformat\s+[A-Za-z_][A-Za-z0-9_]*/gi);
+  return matches ? matches.length : 0;
+}
+
+describe('lib/sql / stripTrailingFormatClause', function () {
+  describe('strips a trailing top-level FORMAT clause', function () {
+    it('removes FORMAT CSV', function () {
+      expect(stripTrailingFormatClause('SELECT * FROM events FORMAT CSV')).to.equal(
+        'SELECT * FROM events',
+      );
+    });
+
+    it('removes FORMAT CSVWithNames', function () {
+      expect(
+        stripTrailingFormatClause('SELECT a, b FROM t FORMAT CSVWithNames'),
+      ).to.equal('SELECT a, b FROM t');
+    });
+
+    it('removes FORMAT JSON', function () {
+      expect(stripTrailingFormatClause('SELECT 1 FORMAT JSON')).to.equal('SELECT 1');
+    });
+
+    it('is case-insensitive on the keyword and name', function () {
+      expect(stripTrailingFormatClause('SELECT 1 format json')).to.equal('SELECT 1');
+      expect(stripTrailingFormatClause('SELECT 1 Format JSONEachRow')).to.equal(
+        'SELECT 1',
+      );
+    });
+
+    it('handles newlines and extra whitespace before FORMAT', function () {
+      expect(
+        stripTrailingFormatClause('SELECT 1\n FROM t\nFORMAT TabSeparated'),
+      ).to.equal('SELECT 1\n FROM t');
+    });
+  });
+
+  describe('handles trailing semicolons', function () {
+    it('removes a bare trailing semicolon', function () {
+      expect(stripTrailingFormatClause('SELECT 1;')).to.equal('SELECT 1');
+    });
+
+    it('removes a semicolon after a FORMAT clause', function () {
+      expect(stripTrailingFormatClause('SELECT 1 FORMAT CSV;')).to.equal('SELECT 1');
+    });
+
+    it('removes a FORMAT clause that follows a semicolon', function () {
+      expect(
+        stripTrailingFormatClause('SELECT x FROM t ORDER BY x; FORMAT CSV'),
+      ).to.equal('SELECT x FROM t ORDER BY x');
+    });
+
+    it('removes whitespace and multiple trailing semicolons around FORMAT', function () {
+      expect(stripTrailingFormatClause('SELECT 1 FORMAT CSV ; ')).to.equal('SELECT 1');
+    });
+  });
+
+  describe('does not truncate FORMAT-like identifiers (no false positives)', function () {
+    it('keeps a formatDateTime(...) call with no trailing clause', function () {
+      const sql = 'SELECT formatDateTime(ts, \'%Y-%m-%d\') AS day FROM events';
+      expect(stripTrailingFormatClause(sql)).to.equal(sql);
+    });
+
+    it('strips only the real clause, keeping formatDateTime(...) intact', function () {
+      expect(
+        stripTrailingFormatClause(
+          'SELECT formatDateTime(ts) AS day, count() FROM events GROUP BY day FORMAT CSV',
+        ),
+      ).to.equal('SELECT formatDateTime(ts) AS day, count() FROM events GROUP BY day');
+    });
+
+    it('keeps a column/alias literally named format', function () {
+      const sql = 'SELECT id AS format FROM t';
+      expect(stripTrailingFormatClause(sql)).to.equal(sql);
+    });
+
+    it('keeps a real FORMAT clause even when a column is named format', function () {
+      expect(
+        stripTrailingFormatClause('SELECT format FROM t FORMAT JSON'),
+      ).to.equal('SELECT format FROM t');
+    });
+
+    it('keeps ORDER BY on a column named format with a sort direction', function () {
+      const desc = 'SELECT format FROM t ORDER BY format DESC';
+      expect(stripTrailingFormatClause(desc)).to.equal(desc);
+      const asc = 'SELECT format FROM t ORDER BY format asc';
+      expect(stripTrailingFormatClause(asc)).to.equal(asc);
+    });
+
+    it('still strips a real clause after ORDER BY format DESC', function () {
+      const sql = 'SELECT format FROM t ORDER BY format DESC';
+      expect(stripTrailingFormatClause(`${sql} FORMAT CSV`)).to.equal(sql);
+    });
+  });
+
+  describe('is aware of quotes, comments, and parentheses', function () {
+    it('ignores FORMAT inside a string literal', function () {
+      const sql = "SELECT 'FORMAT CSV' AS note FROM t";
+      expect(stripTrailingFormatClause(sql)).to.equal(sql);
+    });
+
+    it('ignores FORMAT inside a line comment', function () {
+      const sql = 'SELECT 1 -- FORMAT CSV';
+      expect(stripTrailingFormatClause(sql)).to.equal(sql);
+    });
+
+    it('ignores FORMAT inside a # line comment (MySQL-style)', function () {
+      const sql = 'SELECT 1 # FORMAT CSV';
+      expect(stripTrailingFormatClause(sql)).to.equal(sql);
+    });
+
+    it('strips a real FORMAT clause trailed by a # comment', function () {
+      expect(stripTrailingFormatClause('SELECT 1 FORMAT CSV # note')).to.equal(
+        'SELECT 1',
+      );
+    });
+
+    it('ignores FORMAT inside a block comment', function () {
+      const sql = 'SELECT 1 /* FORMAT CSV */';
+      expect(stripTrailingFormatClause(sql)).to.equal(sql);
+    });
+
+    it('ignores a FORMAT nested inside a subquery (not trailing)', function () {
+      const sql = 'SELECT * FROM (SELECT 1 FORMAT CSV) AS x';
+      expect(stripTrailingFormatClause(sql)).to.equal(sql);
+    });
+  });
+
+  describe('no-ops', function () {
+    it('leaves a plain query untouched', function () {
+      const sql = 'SELECT count(*) FROM events';
+      expect(stripTrailingFormatClause(sql)).to.equal(sql);
+    });
+
+    it('returns empty input unchanged', function () {
+      expect(stripTrailingFormatClause('')).to.equal('');
+    });
+
+    it('does not manufacture an empty query from a bare FORMAT clause', function () {
+      expect(stripTrailingFormatClause('FORMAT JSON')).to.equal('FORMAT JSON');
+    });
+  });
+
+  describe('produces a query that wraps cleanly (the bug being fixed)', function () {
+    const cases = [
+      'SELECT * FROM events FORMAT CSV',
+      'SELECT a, b FROM t FORMAT CSVWithNames;',
+      'SELECT x FROM t ORDER BY x; FORMAT JSON',
+      'SELECT formatDateTime(ts) AS day FROM events FORMAT CSV',
+    ];
+
+    cases.forEach(function (sql) {
+      it(`leaves no FORMAT inside the parens for: ${sql}`, function () {
+        const wrapped = wrap(stripTrailingFormatClause(sql));
+        // Exactly one FORMAT clause survives — the outer one the API adds.
+        expect(countFormatClauses(wrapped)).to.equal(1);
+        // ...and it sits at the very end, outside the subquery parentheses.
+        expect(wrapped).to.match(/\)\s+LIMIT\s+100\s+FORMAT\s+JSON$/);
+        const innerParens = wrapped.slice(
+          wrapped.indexOf('(') + 1,
+          wrapped.lastIndexOf(')'),
+        );
+        expect(countFormatClauses(innerParens)).to.equal(0);
+      });
+    });
+  });
+});