import fs from 'fs';
import { Readable } from 'node:stream';
import { pipeline } from 'node:stream/promises';
import os from 'os';
import path from 'path';

import { MyDumperSectionSizeTransform, patchMyDumperSectionSizes } from '../../src/lib/database';

/**
 * Temp directories created by runTransform(); removed again after every test so the
 * suite does not leave `mydumper-transform-test-*` directories behind in os.tmpdir().
 */
const tempDirs = [];

/**
 * Builds a mydumper-style stream: each section is a "-- <name> <size>" header line
 * followed by its content (ending with its own newline) and a separator newline
 * before the next header. The final section runs to end of stream.
 *
 * @param {Array<{name: string, declaredSize: number, content: string}>} sections
 * @returns {string} The concatenated stream text.
 */
const buildStream = sections =>
	sections
		.map( ( { name, declaredSize, content }, index ) => {
			const separator = index === sections.length - 1 ? '' : '\n';
			return `-- ${ name } ${ declaredSize }\n${ content }${ separator }`;
		} )
		.join( '' );

/**
 * Pushes `input` through MyDumperSectionSizeTransform in `chunkSize`-character chunks,
 * writes the result to a temp file and patches the header sizes in place.
 *
 * @param {string} input               Stream text (treated as latin1, one byte per character).
 * @param {{chunkSize?: number}} [options]
 * @returns {Promise<{transform: MyDumperSectionSizeTransform, outputFile: string, output: string}>}
 */
const runTransform = async ( input, { chunkSize = 8 } = {} ) => {
	const chunks = [];
	for ( let offset = 0; offset < input.length; offset += chunkSize ) {
		chunks.push( Buffer.from( input.slice( offset, offset + chunkSize ), 'latin1' ) );
	}

	const transform = new MyDumperSectionSizeTransform();
	const tempDir = fs.mkdtempSync( path.join( os.tmpdir(), 'mydumper-transform-test-' ) );
	// Register before doing any I/O so the directory is cleaned up even if the test fails.
	tempDirs.push( tempDir );
	const outputFile = path.join( tempDir, 'out.sql' );
	await pipeline( Readable.from( chunks ), transform, fs.createWriteStream( outputFile ) );
	await patchMyDumperSectionSizes( outputFile, transform );

	return { transform, outputFile, output: fs.readFileSync( outputFile, 'latin1' ) };
};

/**
 * Extracts every "-- <name> <size>" header line from `output`.
 *
 * @param {string} output
 * @returns {Array<{name: string, size: number, start: number, end: number}>}
 *          `start`/`end` are offsets of the header text, excluding its newline.
 */
const parseHeaders = output => {
	const headers = [];
	const regex = /^-- ([^ ]+) (\d+)$/gm;
	let match;
	while ( ( match = regex.exec( output ) ) !== null ) {
		headers.push( {
			name: match[ 1 ],
			size: parseInt( match[ 2 ], 10 ),
			start: match.index,
			end: match.index + match[ 0 ].length,
		} );
	}
	return headers;
};

describe( 'lib/database', () => {
	afterEach( () => {
		while ( tempDirs.length ) {
			fs.rmSync( tempDirs.pop(), { recursive: true, force: true } );
		}
	} );

	describe( 'MyDumperSectionSizeTransform', () => {
		it( 'recomputes section sizes from actual content (stale sizes after search-replace)', async () => {
			// Declared sizes are stale on purpose: content was "search-replaced".
			const metadata = '# Started dump\n[config]\nquote-character = BACKTICK\n';
			const schema = 'CREATE TABLE `wp_options` (`id` bigint);\n';
			const data = "INSERT INTO `wp_options` VALUES (1,'new.domain');\n";
			const input = buildStream( [
				{ name: 'metadata.header', declaredSize: 9999, content: metadata },
				{ name: 'db.wp_options-schema.sql', declaredSize: 1, content: schema },
				{ name: 'db.wp_options.00000.sql', declaredSize: 12345, content: data },
			] );

			const { output } = await runTransform( input );
			const headers = parseHeaders( output );

			expect( headers ).toHaveLength( 3 );
			expect( headers[ 0 ].size ).toBe( metadata.length );
			expect( headers[ 1 ].size ).toBe( schema.length );
			expect( headers[ 2 ].size ).toBe( data.length );
		} );

		it( 'preserves content bytes exactly and keeps the size convention parseable', async () => {
			const content1 = 'line one\nline two\n';
			const content2 = "INSERT INTO `t` VALUES ('x');\n";
			const input = buildStream( [
				{ name: 'metadata.header', declaredSize: 0, content: content1 },
				{ name: 'db.t.00000.sql', declaredSize: 0, content: content2 },
			] );

			const { output } = await runTransform( input );
			const headers = parseHeaders( output );

			// Re-derive each section's content from the declared size and compare:
			// size counts content bytes; one separator newline precedes the next header.
			const section1 = output.slice(
				headers[ 0 ].end + 1,
				headers[ 0 ].end + 1 + headers[ 0 ].size
			);
			expect( section1 ).toBe( content1 );
			expect( output[ headers[ 0 ].end + 1 + headers[ 0 ].size ] ).toBe( '\n' );

			const section2 = output.slice(
				headers[ 1 ].end + 1,
				headers[ 1 ].end + 1 + headers[ 1 ].size
			);
			expect( section2 ).toBe( content2 );
		} );

		it.each( [ 1, 3, 7, 64 ] )(
			'is chunk-boundary safe (chunk size %i splits headers mid-line)',
			async chunkSize => {
				const content = 'some content here\n';
				const input = buildStream( [
					{ name: 'metadata.header', declaredSize: 5, content },
					{ name: 'db.table-schema.sql', declaredSize: 5, content },
				] );

				const { output } = await runTransform( input, { chunkSize } );
				const headers = parseHeaders( output );

				expect( headers ).toHaveLength( 2 );
				expect( headers[ 0 ].size ).toBe( content.length );
				expect( headers[ 1 ].size ).toBe( content.length );
			}
		);

		it( 'documents the known limitation: content lines shaped "-- " are treated as headers', async () => {
			// mydumper never emits such content lines (string newlines are escaped, so content
			// cannot begin a line with "-- "), but a hand-edited dump could contain one. This
			// test documents the behavior so a future change here is deliberate, not accidental.
			const content = 'real content\n-- handwritten_note 42\nmore content\n';
			const input = buildStream( [ { name: 'metadata.header', declaredSize: 1, content } ] );

			const { transform } = await runTransform( input );

			// The phantom line is counted as a section boundary.
			expect( transform.fixups ).toHaveLength( 2 );
		} );

		it( 'does not treat header-looking content lines without a numeric size as headers', async () => {
			const content = '-- this is just a comment\n-- not a header either\n';
			const input = buildStream( [ { name: 'metadata.header', declaredSize: 1, content } ] );

			const { output, transform } = await runTransform( input );

			expect( transform.fixups ).toHaveLength( 1 );
			expect( parseHeaders( output ) ).toHaveLength( 1 );
			expect( output ).toContain( content );
		} );

		it( 'handles a final section without a trailing newline', async () => {
			const content = 'no trailing newline';
			const input = `-- metadata.header 5\n${ content }`;

			const { output, transform } = await runTransform( input );

			expect( transform.fixups[ 0 ].size ).toBe( content.length );
			expect( parseHeaders( output )[ 0 ].size ).toBe( content.length );
		} );
	} );
} );
) ); } ); + it( 'should throw for compressed input files', async () => { + const promise = searchAndReplace( '/tmp/some-dump.sql.GZ', 'a,b', {}, binary ); + await expect( promise ).rejects.toThrow( 'Compressed files are not supported' ); + } ); it( 'will accept and use a string of replacement pairs (when one replacement provided)', async () => { // Mock the confirmation prompt so it doesn't actually prompt, and manipulate the resolved value const promptMock = await jest.spyOn( prompt, 'confirm' ).mockResolvedValue( true ); @@ -86,6 +90,66 @@ describe( 'lib/search-and-replace', () => { fs.unlinkSync( outputFileName ); } ); + it( 'recomputes mydumper section header sizes after replacement changes content length', async () => { + // Minimal mydumper-format stream dump. Sizes follow the mydumper convention: + // content bytes including the content's own trailing newline, with a single + // separator newline before the next header; final section runs to EOF. + const metadata = '# Started dump\n[config]\nquote-character = BACKTICK\n'; + const schemaCreate = 'CREATE DATABASE `testdb`;\n'; + const data = "INSERT INTO `wp_options` VALUES ('ohai world, ohai');\n"; + const myDumperFile = path.join( + fs.mkdtempSync( path.join( require( 'os' ).tmpdir(), 'mydumper-sr-test-' ) ), + 'dump.sql' + ); + fs.writeFileSync( + myDumperFile, + `-- metadata.header ${ metadata.length }\n${ metadata }\n` + + `-- testdb-schema-create.sql ${ schemaCreate.length }\n${ schemaCreate }\n` + + `-- testdb.wp_options.00000.sql ${ data.length }\n${ data }` + ); + + const { outputFileName } = await searchAndReplace( + myDumperFile, + 'ohai,ohHeyLongerValue', + { output: true }, + binary + ); + + const result = fs.readFileSync( outputFileName, { encoding: 'utf-8' } ); + + // Replacement happened + expect( result ).toContain( 'ohHeyLongerValue' ); + expect( result ).not.toContain( 'ohai' ); + + // Every header's declared size must match the actual content that follows it + const headerRegex = /^-- ([^ 
]+) (\d+)$/gm; + const headers = []; + let match; + while ( ( match = headerRegex.exec( result ) ) !== null ) { + headers.push( { + size: parseInt( match[ 2 ], 10 ), + contentStart: match.index + match[ 0 ].length + 1, + } ); + } + expect( headers ).toHaveLength( 3 ); + + headers.forEach( ( { size, contentStart } ) => { + expect( result.slice( contentStart, contentStart + size ).length ).toBe( size ); + } ); + + // a single separator newline must follow each section's content before the next header + headers.slice( 0, -1 ).forEach( ( { size, contentStart } ) => { + expect( result[ contentStart + size ] ).toBe( '\n' ); + expect( result.slice( contentStart + size + 1, contentStart + size + 4 ) ).toBe( '-- ' ); + } ); + + // final section runs exactly to end of stream + const lastHeader = headers[ headers.length - 1 ]; + expect( lastHeader.contentStart + lastHeader.size ).toBe( result.length ); + + fs.unlinkSync( outputFileName ); + } ); + it( 'will remove whitespace from the beginning and end of pairs', async () => { jest.spyOn( searchReplaceLib, 'replace' ); const replaceSpy = searchReplaceLib.replace; diff --git a/src/commands/dev-env-sync-sql.ts b/src/commands/dev-env-sync-sql.ts index fb2a2ff5b..768952aed 100644 --- a/src/commands/dev-env-sync-sql.ts +++ b/src/commands/dev-env-sync-sql.ts @@ -17,7 +17,12 @@ import API from '../lib/api'; import { BackupStorageAvailability } from '../lib/backup-storage-availability/backup-storage-availability'; import * as exit from '../lib/cli/exit'; import { unzipFile } from '../lib/client-file-uploader'; -import { fixMyDumperTransform, getSqlDumpDetails, SqlDumpType } from '../lib/database'; +import { + MyDumperSectionSizeTransform, + getSqlDumpDetails, + patchMyDumperSectionSizes, + SqlDumpType, +} from '../lib/database'; import { LiveBackupCopyCLIOptions } from '../lib/live-backup-copy'; import { makeTempDir } from '../lib/utils'; import { getReadInterface } from '../lib/validations/line-by-line'; @@ -209,12 +214,20 @@ export 
/**
 * Size correction for one section header emitted by MyDumperSectionSizeTransform.
 */
interface MyDumperSectionSizeFixup {
	/** Byte offset in the output file where the fixed-width size field begins. */
	sizeOffset: number;
	/** Recomputed byte size of the section content; -1 until the section is closed. */
	size: number;
}

/**
 * Width of the zero-padded size placeholder we emit in section headers.
 * 20 digits fits any uint64, so the field never needs to grow when patched.
 */
const MYDUMPER_SIZE_FIELD_WIDTH = 20;

const NEWLINE = 0x0a;
/**
 * Known limitation: a *content* line that happens to look like "-- <name> <size>" would be
 * misidentified as a section header (same assumption as the previous implementation). In
 * practice mydumper never emits such lines — string values have their newlines escaped, so
 * content cannot start a line with "-- " — and a scan of a 201k-table production dump found
 * exactly zero false matches. Tightening the filename grammar instead would risk *missing*
 * real headers (merging sections), which is strictly worse than a phantom match.
 */
const MYDUMPER_HEADER_REGEX = /^-- ([^ ]+) \d+\n$/;
// Header lines are short ("-- <name> <size>"); anything longer is data.
const MYDUMPER_HEADER_MAX_LENGTH = 1024;
/** The three bytes every section header starts with. */
const MYDUMPER_HEADER_PREFIX = Buffer.from( '-- ', 'latin1' );

/**
 * Tells whether an unterminated line fragment can still grow into a section header.
 *
 * A header (newline included) is at most MYDUMPER_HEADER_MAX_LENGTH bytes and starts with
 * "-- ", so a fragment that is already that long, or whose known bytes disagree with the
 * prefix, is guaranteed to be content no matter what follows.
 *
 * @param partialLine Bytes of the current line seen so far (no newline yet).
 * @returns `true` when the fragment must be kept until the line is complete.
 */
const canStillBecomeMyDumperHeader = ( partialLine: Buffer ): boolean => {
	if ( partialLine.length >= MYDUMPER_HEADER_MAX_LENGTH ) {
		return false;
	}
	const known = Math.min( partialLine.length, MYDUMPER_HEADER_PREFIX.length );
	return partialLine.subarray( 0, known ).equals( MYDUMPER_HEADER_PREFIX.subarray( 0, known ) );
};

/**
 * Rewrites mydumper stream section headers ("-- <name> <size>") with sizes recomputed
 * from the actual (post search-replace) content.
 *
 * Search-replace changes content lengths, so the original sizes become wrong. myloader uses
 * the size to tell a real section header apart from header-looking *content* inside a file:
 * while fewer bytes than the declared size have been written, a header line is treated as
 * content. The previous implementation rewrote sizes to "-1", which myloader >= 0.20 parses
 * with g_ascii_strtoull() into ULLONG_MAX — making it swallow every subsequent header as
 * content of the first section and import nothing.
 *
 * Sizes are not knowable while streaming, so this transform emits a fixed-width zero-padded
 * placeholder, counts each section's content bytes as they pass through, and records the
 * placeholder's byte offset in `fixups`. Callers writing to a file must call
 * patchMyDumperSectionSizes() afterwards to overwrite the placeholders in place (same byte
 * length, so offsets are stable).
 *
 * Size convention (verified against mydumper output): a section's size counts its content
 * bytes including the content's own trailing newline, but not the single separator newline
 * that precedes the next header. The final section runs to end of stream.
 *
 * Buffering is bounded: only a line fragment that could still become a header (fewer than
 * MYDUMPER_HEADER_MAX_LENGTH bytes, starting with "-- ") is held back between chunks.
 * Anything else is forwarded immediately, so a very long data line is neither accumulated
 * in memory nor re-copied on every incoming chunk.
 */
export class MyDumperSectionSizeTransform extends Transform {
	/** One entry per detected header, in stream order. */
	public readonly fixups: MyDumperSectionSizeFixup[] = [];

	/** Unterminated line fragment that may still turn out to be a header. */
	private leftover: Buffer = Buffer.alloc( 0 );
	/** True while inside a line whose beginning was already forwarded as content. */
	private insideContentLine = false;
	/** Number of bytes emitted so far, i.e. the output offset of the next byte. */
	private bytesOut = 0;
	/** Output offset at which the currently open section's content starts. */
	private contentStart = -1;

	public constructor() {
		super();
	}

	/**
	 * Splits the incoming bytes into lines, replaces header lines with placeholder headers
	 * and forwards everything else untouched.
	 *
	 * @param chunk     Next slice of the stream.
	 * @param _encoding Unused; data is handled as raw bytes.
	 * @param callback  Receives the bytes to emit for this chunk.
	 */
	public _transform( chunk: Buffer, _encoding: BufferEncoding, callback: TransformCallback ) {
		const out: Buffer[] = [];
		let data: Buffer = chunk;

		if ( this.insideContentLine ) {
			// The start of this line was already classified as content: forward the rest
			// of it (up to and including its newline, if present) without inspection.
			const lineEnd = data.indexOf( NEWLINE );
			const consumed = lineEnd === -1 ? data.length : lineEnd + 1;
			this.emitContent( data.subarray( 0, consumed ), out );
			data = data.subarray( consumed );
			this.insideContentLine = lineEnd === -1;
		} else if ( this.leftover.length ) {
			data = Buffer.concat( [ this.leftover, chunk ] );
		}

		let lineStart = 0;
		let newlineIndex = data.indexOf( NEWLINE, lineStart );
		while ( newlineIndex !== -1 ) {
			this.processLine( data.subarray( lineStart, newlineIndex + 1 ), out );
			lineStart = newlineIndex + 1;
			newlineIndex = data.indexOf( NEWLINE, lineStart );
		}

		const partialLine = data.subarray( lineStart );
		if ( canStillBecomeMyDumperHeader( partialLine ) ) {
			// Keep the trailing partial line for the next chunk. Copy it: `data` may alias
			// the incoming chunk, whose underlying memory the stream may reuse.
			this.leftover = Buffer.from( partialLine );
		} else {
			// Cannot be a header any more: forward now instead of buffering the whole line.
			this.emitContent( partialLine, out );
			this.leftover = Buffer.alloc( 0 );
			this.insideContentLine = true;
		}

		// Buffer.concat() copies, so emitted data never aliases the incoming chunk.
		callback( null, Buffer.concat( out ) );
	}

	/**
	 * Emits a buffered final line that had no trailing newline and closes the last section.
	 *
	 * @param callback Receives the remaining bytes, if any.
	 */
	public _flush( callback: TransformCallback ) {
		let tail: Buffer | undefined;
		if ( this.leftover.length ) {
			tail = this.leftover;
			this.bytesOut += tail.length;
			this.leftover = Buffer.alloc( 0 );
		}

		// Close the final section: it runs to the end of the stream.
		this.closePendingSection( this.bytesOut - this.contentStart );

		callback( null, tail );
	}

	/**
	 * Handles one complete line (newline included): either rewrites it as a placeholder
	 * header and opens a new section, or forwards it as content.
	 */
	private processLine( line: Buffer, out: Buffer[] ) {
		const looksLikeHeader =
			line.length <= MYDUMPER_HEADER_MAX_LENGTH &&
			line.subarray( 0, MYDUMPER_HEADER_PREFIX.length ).equals( MYDUMPER_HEADER_PREFIX );
		// latin1 maps bytes 1:1 to characters, so string lengths below are byte lengths.
		const match = looksLikeHeader ? MYDUMPER_HEADER_REGEX.exec( line.toString( 'latin1' ) ) : null;
		if ( ! match ) {
			this.emitContent( line, out );
			return;
		}

		// Close the previous section. Its content ends one separator newline
		// before this header.
		this.closePendingSection( Math.max( this.bytesOut - this.contentStart - 1, 0 ) );

		const name = match[ 1 ];
		const header = Buffer.from(
			`-- ${ name } ${ '0'.repeat( MYDUMPER_SIZE_FIELD_WIDTH ) }\n`,
			'latin1'
		);
		this.fixups.push( {
			// "-- " + name + " " precede the size field.
			sizeOffset: this.bytesOut + 3 + name.length + 1,
			size: -1,
		} );
		out.push( header );
		this.bytesOut += header.length;
		this.contentStart = this.bytesOut;
	}

	/** Forwards `bytes` unchanged and accounts for them in the output offset. */
	private emitContent( bytes: Buffer, out: Buffer[] ) {
		if ( bytes.length === 0 ) {
			return;
		}
		out.push( bytes );
		this.bytesOut += bytes.length;
	}

	/** Records `size` on the most recent fixup if that section is still open. */
	private closePendingSection( size: number ) {
		const pending = this.fixups[ this.fixups.length - 1 ];
		if ( pending && pending.size === -1 ) {
			pending.size = size;
		}
	}
}

/**
 * Backwards-compatible factory; see MyDumperSectionSizeTransform.
 *
 * @returns A fresh transform instance.
 */
export const fixMyDumperTransform = (): MyDumperSectionSizeTransform =>
	new MyDumperSectionSizeTransform();

/**
 * Overwrites the size placeholders emitted by MyDumperSectionSizeTransform with the
 * recomputed sizes. Must be called after the write stream has finished. The replacement
 * is the same byte length as the placeholder, so all recorded offsets stay valid.
 *
 * Fixups whose size is still unresolved (negative) are skipped, leaving their all-zero
 * placeholder in the file.
 *
 * @param filePath  File the transform's output was written to; modified in place.
 * @param transform The transform instance that produced that output.
 * @returns Resolves once every resolved fixup is written and the file handle is closed.
 */
export async function patchMyDumperSectionSizes(
	filePath: string,
	transform: MyDumperSectionSizeTransform
): Promise< void > {
	const fileHandle = await fs.promises.open( filePath, 'r+' );
	try {
		for ( const fixup of transform.fixups ) {
			if ( fixup.size < 0 ) {
				continue;
			}
			const sizeField = String( fixup.size ).padStart( MYDUMPER_SIZE_FIELD_WIDTH, '0' );
			// eslint-disable-next-line no-await-in-loop -- intentionally sequential: positional writes on a single file handle
			await fileHandle.write( sizeField, fixup.sizeOffset, 'latin1' );
		}
	} finally {
		await fileHandle.close();
	}
}
+ ); + } + const dumpDetails = await getSqlDumpDetails( fileName ); const isMyDumper = dumpDetails.type === SqlDumpType.MYDUMPER; @@ -181,8 +194,10 @@ export const searchAndReplace = async ( const transforms: NodeJS.ReadWriteStream[] = []; + let myDumperTransform: MyDumperSectionSizeTransform | undefined; if ( isMyDumper ) { - transforms.push( fixMyDumperTransform() ); + myDumperTransform = new MyDumperSectionSizeTransform(); + transforms.push( myDumperTransform ); } try { @@ -196,6 +211,19 @@ export const searchAndReplace = async ( throw error; } + if ( myDumperTransform ) { + if ( outputFileName ) { + // Replace the size placeholders in the section headers with the recomputed + // sizes; myloader needs them to parse the stream correctly. + await patchMyDumperSectionSizes( outputFileName, myDumperTransform ); + } else { + console.error( + chalk.yellow( 'Warning:' ), + 'Output was not written to a file, so mydumper section header sizes were left as placeholders. The result is not directly importable with myloader.' + ); + } + } + const endTime = process.hrtime( startTime ); const end = endTime[ 1 ] / 1000000; // time in ms