-
Notifications
You must be signed in to change notification settings - Fork 0
Strip trailing FORMAT clauses from SQL queries before sending #11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+332
−1
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,159 @@ | ||
| /** | ||
| * SQL helpers for the query command. | ||
| * | ||
| * The Formo query API wraps the SQL you submit so it can paginate the result | ||
| * set and force a machine-readable response: | ||
| * | ||
| * SELECT * FROM (<your query>) LIMIT <n> FORMAT JSON | ||
| * | ||
| * ClickHouse does not allow a `FORMAT` clause inside a subquery, so if your | ||
| * query ends in its own `FORMAT` clause the wrapped statement becomes: | ||
| * | ||
| * SELECT * FROM (SELECT ... FORMAT CSV) LIMIT 100 FORMAT JSON | ||
| * | ||
| * which ClickHouse rejects with a 400. A trailing `FORMAT` (or a trailing | ||
| * semicolon) can never take effect through this endpoint anyway — the outer | ||
| * `FORMAT JSON` always wins, and output shaping is the CLI's `--format` job — | ||
| * so we remove it before sending and let the server wrap a clean query. | ||
| */ | ||
|
|
||
| /** | ||
| * Strip a trailing, top-level `FORMAT <name>` clause and any trailing | ||
| * semicolons from a SQL statement. | ||
| * | ||
| * The scan is aware of string literals, quoted identifiers, and comments, so | ||
| * `FORMAT`-looking text inside them is never mistaken for a real clause. The | ||
| * match is anchored to the end of the statement, so a `FORMAT` nested inside | ||
| * parentheses (a subquery) or part of an identifier/function such as | ||
| * `formatDateTime(...)` — or a column aliased `format` — is left untouched. | ||
| * | ||
| * Returns the original input unchanged when there is nothing to strip. | ||
| */ | ||
| export function stripTrailingFormatClause(sql: string): string { | ||
| if (!sql) return sql | ||
|
|
||
| const masked = maskLiterals(sql) | ||
| // A genuine trailing FORMAT clause: the keyword `format` preceded by a word | ||
| // boundary (so `formatDateTime` or a column aliased `format` is safe), | ||
| // followed by exactly one identifier (the format name) and nothing else. | ||
| const formatClause = /(^|[^A-Za-z0-9_])format\s+[A-Za-z_][A-Za-z0-9_]*\s*$/i | ||
|
|
||
| let end = sql.length | ||
| let didStrip = false | ||
| // Peel top-level semicolons and a trailing FORMAT clause repeatedly so any | ||
| // ordering collapses to the bare query, e.g. `... FORMAT CSV;`, | ||
| // `...; FORMAT CSV`, or `... FORMAT CSV ;`. Trailing whitespace and comments | ||
| // are only ever skipped to *look* past them — never removed on their own. | ||
| for (;;) { | ||
| let e = end | ||
| while (e > 0 && /\s/.test(masked[e - 1])) e-- | ||
| if (e === 0) break | ||
|
|
||
| if (masked[e - 1] === ';') { | ||
| end = e - 1 | ||
| didStrip = true | ||
| continue | ||
| } | ||
|
|
||
| const match = formatClause.exec(masked.slice(0, e)) | ||
| if (match) { | ||
| // Cut at the `format` keyword, after the leading word-boundary char. | ||
| end = match.index + match[1].length | ||
| didStrip = true | ||
| continue | ||
| } | ||
|
|
||
| break | ||
| } | ||
|
|
||
| if (!didStrip) return sql | ||
|
|
||
| // Tidy the real whitespace now left dangling where the clause used to be. | ||
| // Comments are not whitespace, so any genuine comment is preserved. | ||
| const stripped = sql.slice(0, end).replace(/\s+$/, '') | ||
| // Never manufacture an empty query from non-empty input (degenerate inputs | ||
| // such as a bare `FORMAT JSON`): let the original surface its own error. | ||
| return stripped === '' ? sql : stripped | ||
| } | ||
|
|
||
| /** | ||
| * Return a copy of `sql` with the *contents* of string literals, quoted | ||
| * identifiers, and comments replaced by spaces, preserving the original length | ||
| * so indices stay aligned with the source. Quote delimiters are kept; comments | ||
| * are blanked entirely. This lets the clause scanner reason about real SQL code | ||
| * without tripping over keywords that merely appear inside literals or comments. | ||
| */ | ||
| function maskLiterals(sql: string): string { | ||
| const out: string[] = [] | ||
| const n = sql.length | ||
| let i = 0 | ||
|
|
||
| while (i < n) { | ||
| const c = sql[i] | ||
| const next = i + 1 < n ? sql[i + 1] : '' | ||
|
|
||
| // Line comment: -- ... or # ... to end of line. ClickHouse accepts `#` | ||
| // and `#!` line comments for MySQL compatibility. | ||
| if ((c === '-' && next === '-') || c === '#') { | ||
| while (i < n && sql[i] !== '\n') { | ||
| out.push(' ') | ||
| i++ | ||
| } | ||
| continue | ||
| } | ||
|
|
||
| // Block comment: /* ... */ | ||
| if (c === '/' && next === '*') { | ||
| out.push(' ', ' ') | ||
| i += 2 | ||
| while (i < n && !(sql[i] === '*' && sql[i + 1] === '/')) { | ||
| out.push(' ') | ||
| i++ | ||
| } | ||
| if (i < n) { | ||
| out.push(' ', ' ') | ||
| i += 2 | ||
| } | ||
| continue | ||
| } | ||
|
|
||
| // String literal or quoted identifier: '...', "...", `...` | ||
| if (c === "'" || c === '"' || c === '`') { | ||
| const quote = c | ||
| out.push(quote) | ||
| i++ | ||
| while (i < n) { | ||
| const d = sql[i] | ||
| // Backslash escapes are honored inside single-quoted strings. | ||
| if (d === '\\' && quote === "'") { | ||
| out.push(' ') | ||
| i++ | ||
| if (i < n) { | ||
| out.push(' ') | ||
| i++ | ||
| } | ||
| continue | ||
| } | ||
| if (d === quote) { | ||
| // A doubled delimiter is an escaped quote, not the terminator. | ||
| if (i + 1 < n && sql[i + 1] === quote) { | ||
| out.push(' ', ' ') | ||
| i += 2 | ||
| continue | ||
| } | ||
| out.push(quote) | ||
| i++ | ||
| break | ||
| } | ||
| out.push(' ') | ||
| i++ | ||
| } | ||
| continue | ||
| } | ||
|
|
||
| out.push(c) | ||
| i++ | ||
| } | ||
|
|
||
| return out.join('') | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,168 @@ | ||
| import { expect } from 'chai'; | ||
| import { stripTrailingFormatClause } from '../../src/lib/sql'; | ||
|
|
||
| /** | ||
| * Mirror what the API does to a submitted query: wrap it in a paginating | ||
| * subquery with a single outer FORMAT JSON. Used to assert the stripped query | ||
| * produces a statement with no FORMAT inside the parentheses. | ||
| */ | ||
| function wrap(inner: string): string { | ||
| return `SELECT * FROM (${inner}) LIMIT 100 FORMAT JSON`; | ||
| } | ||
|
|
||
| /** Count real `FORMAT <name>` clauses (ignores formatDateTime, etc.). */ | ||
| function countFormatClauses(sql: string): number { | ||
| const matches = sql.match(/\bformat\s+[A-Za-z_][A-Za-z0-9_]*/gi); | ||
| return matches ? matches.length : 0; | ||
| } | ||
|
|
||
| describe('lib/sql / stripTrailingFormatClause', function () { | ||
| describe('strips a trailing top-level FORMAT clause', function () { | ||
| it('removes FORMAT CSV', function () { | ||
| expect(stripTrailingFormatClause('SELECT * FROM events FORMAT CSV')).to.equal( | ||
| 'SELECT * FROM events', | ||
| ); | ||
| }); | ||
|
|
||
| it('removes FORMAT CSVWithNames', function () { | ||
| expect( | ||
| stripTrailingFormatClause('SELECT a, b FROM t FORMAT CSVWithNames'), | ||
| ).to.equal('SELECT a, b FROM t'); | ||
| }); | ||
|
|
||
| it('removes FORMAT JSON', function () { | ||
| expect(stripTrailingFormatClause('SELECT 1 FORMAT JSON')).to.equal('SELECT 1'); | ||
| }); | ||
|
|
||
| it('is case-insensitive on the keyword and name', function () { | ||
| expect(stripTrailingFormatClause('SELECT 1 format json')).to.equal('SELECT 1'); | ||
| expect(stripTrailingFormatClause('SELECT 1 Format JSONEachRow')).to.equal( | ||
| 'SELECT 1', | ||
| ); | ||
| }); | ||
|
|
||
| it('handles newlines and extra whitespace before FORMAT', function () { | ||
| expect( | ||
| stripTrailingFormatClause('SELECT 1\n FROM t\nFORMAT TabSeparated'), | ||
| ).to.equal('SELECT 1\n FROM t'); | ||
| }); | ||
| }); | ||
|
|
||
| describe('handles trailing semicolons', function () { | ||
| it('removes a bare trailing semicolon', function () { | ||
| expect(stripTrailingFormatClause('SELECT 1;')).to.equal('SELECT 1'); | ||
| }); | ||
|
|
||
| it('removes a semicolon after a FORMAT clause', function () { | ||
| expect(stripTrailingFormatClause('SELECT 1 FORMAT CSV;')).to.equal('SELECT 1'); | ||
| }); | ||
|
|
||
| it('removes a FORMAT clause that follows a semicolon', function () { | ||
| expect( | ||
| stripTrailingFormatClause('SELECT x FROM t ORDER BY x; FORMAT CSV'), | ||
| ).to.equal('SELECT x FROM t ORDER BY x'); | ||
| }); | ||
|
|
||
| it('removes whitespace and multiple trailing semicolons around FORMAT', function () { | ||
| expect(stripTrailingFormatClause('SELECT 1 FORMAT CSV ; ')).to.equal('SELECT 1'); | ||
| }); | ||
| }); | ||
|
|
||
| describe('does not truncate FORMAT-like identifiers (no false positives)', function () { | ||
| it('keeps a formatDateTime(...) call with no trailing clause', function () { | ||
| const sql = 'SELECT formatDateTime(ts, \'%Y-%m-%d\') AS day FROM events'; | ||
| expect(stripTrailingFormatClause(sql)).to.equal(sql); | ||
| }); | ||
|
|
||
| it('strips only the real clause, keeping formatDateTime(...) intact', function () { | ||
| expect( | ||
| stripTrailingFormatClause( | ||
| 'SELECT formatDateTime(ts) AS day, count() FROM events GROUP BY day FORMAT CSV', | ||
| ), | ||
| ).to.equal('SELECT formatDateTime(ts) AS day, count() FROM events GROUP BY day'); | ||
| }); | ||
|
|
||
| it('keeps a column/alias literally named format', function () { | ||
| const sql = 'SELECT id AS format FROM t'; | ||
| expect(stripTrailingFormatClause(sql)).to.equal(sql); | ||
| }); | ||
|
|
||
| it('keeps a real FORMAT clause even when a column is named format', function () { | ||
| expect( | ||
| stripTrailingFormatClause('SELECT format FROM t FORMAT JSON'), | ||
| ).to.equal('SELECT format FROM t'); | ||
| }); | ||
| }); | ||
|
|
||
| describe('is aware of quotes, comments, and parentheses', function () { | ||
| it('ignores FORMAT inside a string literal', function () { | ||
| const sql = "SELECT 'FORMAT CSV' AS note FROM t"; | ||
| expect(stripTrailingFormatClause(sql)).to.equal(sql); | ||
| }); | ||
|
|
||
| it('ignores FORMAT inside a line comment', function () { | ||
| const sql = 'SELECT 1 -- FORMAT CSV'; | ||
| expect(stripTrailingFormatClause(sql)).to.equal(sql); | ||
| }); | ||
|
|
||
| it('ignores FORMAT inside a # line comment (MySQL-style)', function () { | ||
| const sql = 'SELECT 1 # FORMAT CSV'; | ||
| expect(stripTrailingFormatClause(sql)).to.equal(sql); | ||
| }); | ||
|
|
||
| it('strips a real FORMAT clause trailed by a # comment', function () { | ||
| expect(stripTrailingFormatClause('SELECT 1 FORMAT CSV # note')).to.equal( | ||
| 'SELECT 1', | ||
| ); | ||
| }); | ||
|
|
||
| it('ignores FORMAT inside a block comment', function () { | ||
| const sql = 'SELECT 1 /* FORMAT CSV */'; | ||
| expect(stripTrailingFormatClause(sql)).to.equal(sql); | ||
| }); | ||
|
|
||
| it('ignores a FORMAT nested inside a subquery (not trailing)', function () { | ||
| const sql = 'SELECT * FROM (SELECT 1 FORMAT CSV) AS x'; | ||
| expect(stripTrailingFormatClause(sql)).to.equal(sql); | ||
| }); | ||
| }); | ||
|
|
||
| describe('no-ops', function () { | ||
| it('leaves a plain query untouched', function () { | ||
| const sql = 'SELECT count(*) FROM events'; | ||
| expect(stripTrailingFormatClause(sql)).to.equal(sql); | ||
| }); | ||
|
|
||
| it('returns empty input unchanged', function () { | ||
| expect(stripTrailingFormatClause('')).to.equal(''); | ||
| }); | ||
|
|
||
| it('does not manufacture an empty query from a bare FORMAT clause', function () { | ||
| expect(stripTrailingFormatClause('FORMAT JSON')).to.equal('FORMAT JSON'); | ||
| }); | ||
| }); | ||
|
|
||
| describe('produces a query that wraps cleanly (the bug being fixed)', function () { | ||
| const cases = [ | ||
| 'SELECT * FROM events FORMAT CSV', | ||
| 'SELECT a, b FROM t FORMAT CSVWithNames;', | ||
| 'SELECT x FROM t ORDER BY x; FORMAT JSON', | ||
| 'SELECT formatDateTime(ts) AS day FROM events FORMAT CSV', | ||
| ]; | ||
|
|
||
| cases.forEach(function (sql) { | ||
| it(`leaves no FORMAT inside the parens for: ${sql}`, function () { | ||
| const wrapped = wrap(stripTrailingFormatClause(sql)); | ||
| // Exactly one FORMAT clause survives — the outer one the API adds. | ||
| expect(countFormatClauses(wrapped)).to.equal(1); | ||
| // ...and it sits at the very end, outside the subquery parentheses. | ||
| expect(wrapped).to.match(/\)\s+LIMIT\s+100\s+FORMAT\s+JSON$/); | ||
| const innerParens = wrapped.slice( | ||
| wrapped.indexOf('(') + 1, | ||
| wrapped.lastIndexOf(')'), | ||
| ); | ||
| expect(countFormatClauses(innerParens)).to.equal(0); | ||
| }); | ||
| }); | ||
| }); | ||
| }); | ||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.