diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc index 9d3d46f18cb4..57c4cfd51dd4 100644 --- a/web/emcc/wasm_runtime.cc +++ b/web/emcc/wasm_runtime.cc @@ -130,20 +130,29 @@ void ArrayDecodeStorage(Tensor cpu_arr, TVMFFIByteArray* bytes, const std::strin const char* byte_data = bytes->data; const size_t byte_size = bytes->size; if (format == "f32-to-bf16" && dtype == "float32") { - const uint16_t* bf16 = reinterpret_cast(byte_data); - uint32_t* data = static_cast(cpu_arr->data); TVM_FFI_ICHECK(cpu_arr.IsContiguous()); size_t size = 1; for (int i = 0; i < cpu_arr->ndim; ++i) { size *= cpu_arr->shape[i]; } - TVM_FFI_ICHECK_EQ(size, byte_size / 2); - for (size_t i = 0; i < size; ++i) { - data[i] = static_cast(bf16[i]) << 16; + // The "f32-to-bf16" format encodes a float32 tensor as packed bf16 (2 + // bytes per element). When the byte_size matches that expectation, expand + // back to f32. If the byte_size matches the native float32 width + // (4 bytes per element), the payload is already raw float32; fall through + // to the generic byte copy. This makes the loader tolerant of weight + // shards produced by older / alternate quantisation pipelines that retain + // the "f32-to-bf16" tag without performing the bf16 truncation. + if (byte_size == size * sizeof(uint16_t)) { + const uint16_t* bf16 = reinterpret_cast(byte_data); + uint32_t* data = + reinterpret_cast(static_cast(cpu_arr->data) + cpu_arr->byte_offset); + for (size_t i = 0; i < size; ++i) { + data[i] = static_cast(bf16[i]) << 16; + } + return; } - } else { - cpu_arr.CopyFromBytes(byte_data, byte_size); } + cpu_arr.CopyFromBytes(byte_data, byte_size); } TVM_FFI_STATIC_INIT_BLOCK() { diff --git a/web/src/runtime.ts b/web/src/runtime.ts index 078a0c7df21f..80d76f4161b4 100644 --- a/web/src/runtime.ts +++ b/web/src/runtime.ts @@ -1010,9 +1010,11 @@ export class Instance implements Disposable { */ withNewScope(action: () => T): T { this.beginScope(); - const val = action(); - this.endScope(); - return val; + try { + return action(); + } finally { + this.endScope(); + } } /** @@ -1323,6 +1325,23 @@ export class Instance implements Disposable { artifactCache: ArtifactCacheTemplate, signal?: AbortSignal, ) { + // Avoid a single JS-to-wasm byte-array call for multi-hundred-MiB + // tensor-cache records. The cap is a conservative per-call staging size, + // independent of the final tensor allocation size. Smaller records keep + // the existing full-record path. + const maxChunkBytes = 128 * 1024 * 1024; + const storageBytes = (dtype: string) => { + if (dtype === "bool") { + return 1; + } + const match = dtype.match(/(\d+)(?:x(\d+))?$/); + if (match === null) { + throw new Error("Cannot determine storage width of dtype " + dtype); + } + const bits = Number(match[1]); + const lanes = match[2] === undefined ? 1 : Number(match[2]); + return (bits * lanes + 7) >> 3; + }; const perf = compact.getPerformance(); const tstart = perf.now(); let totalBytes = 0; @@ -1421,9 +1440,59 @@ export class Instance implements Disposable { this.empty(rec.shape, rec.dtype, this.cpu()) ) }); - const recSource = buffer.slice(rec.byteOffset, rec.byteOffset + rec.nbytes); + const shardBytes = buffer instanceof Uint8Array ? buffer : new Uint8Array(buffer); + const recSource = + rec.byteOffset === 0 && rec.nbytes === shardBytes.byteLength + ? shardBytes + : shardBytes.subarray(rec.byteOffset, rec.byteOffset + rec.nbytes); + const canChunkRecord = + rec.nbytes > maxChunkBytes && + rec.shape.length >= 1 && + Number.isInteger(rec.shape[0]) && + rec.shape[0] > 0 && + rec.nbytes % rec.shape[0] === 0; + const outerDim = canChunkRecord ? rec.shape[0] : 1; + const sourceStrideBytes = canChunkRecord ? rec.nbytes / outerDim : rec.nbytes; + const targetBytes = rec.shape.reduce((acc, value) => acc * value, 1) * + storageBytes(rec.dtype); + const targetStrideBytes = canChunkRecord ? targetBytes / outerDim : targetBytes; + const copyRecordToTensor = (targetTensor: Tensor, sourceBytes: Uint8Array) => { + if (!canChunkRecord) { + this.ctx.arrayDecodeStorage(targetTensor, sourceBytes, rec.format, rec.dtype); + return; + } + const chunkOuterDim = Math.max(1, Math.floor(maxChunkBytes / sourceStrideBytes)); + for (let outerOffset = 0; outerOffset < outerDim; outerOffset += chunkOuterDim) { + const outerCount = Math.min(chunkOuterDim, outerDim - outerOffset); + const sourceByteOffset = outerOffset * sourceStrideBytes; + const targetByteOffset = outerOffset * targetStrideBytes; + const chunkBytes = outerCount * sourceStrideBytes; + const chunkShape = rec.shape.slice(); + chunkShape[0] = outerCount; + const chunkView = this.withNewScope(() => { + const chunkShapeTuple = this.makeShapeTuple(chunkShape); + return this.detachFromCurrentScope( + this.ctx.tensorCreateView( + targetTensor, + chunkShapeTuple, + rec.dtype, + new Scalar(targetByteOffset, "int"), + ) + ); + }); + const chunkSource = sourceBytes.subarray( + sourceByteOffset, + sourceByteOffset + chunkBytes, + ); + try { + this.ctx.arrayDecodeStorage(chunkView, chunkSource, rec.format, rec.dtype); + } finally { + chunkView.dispose(); + } + } + }; // first sync copy to cpu. - this.ctx.arrayDecodeStorage(cpu_arr, new Uint8Array(recSource), rec.format, rec.dtype); + copyRecordToTensor(cpu_arr, recSource); // then async stream into GPU if needed if (device.deviceType === DeviceStrToEnum.cpu) { this.tensorCacheUpdate(rec.name, cpu_arr, false); @@ -1435,7 +1504,42 @@ export class Instance implements Disposable { this.empty(rec.shape, rec.dtype, device) ) }); - gpu_arr.copyFrom(cpu_arr); + if (!canChunkRecord) { + gpu_arr.copyFrom(cpu_arr); + } else { + const chunkOuterDim = Math.max(1, Math.floor(maxChunkBytes / sourceStrideBytes)); + for (let outerOffset = 0; outerOffset < outerDim; outerOffset += chunkOuterDim) { + const outerCount = Math.min(chunkOuterDim, outerDim - outerOffset); + const targetByteOffset = outerOffset * targetStrideBytes; + const chunkShape = rec.shape.slice(); + chunkShape[0] = outerCount; + const [cpuView, gpuView] = this.withNewScope(() => { + const chunkShapeTuple = this.makeShapeTuple(chunkShape); + const cView = this.ctx.tensorCreateView( + cpu_arr, + chunkShapeTuple, + rec.dtype, + new Scalar(targetByteOffset, "int"), + ); + const gView = this.ctx.tensorCreateView( + gpu_arr, + chunkShapeTuple, + rec.dtype, + new Scalar(targetByteOffset, "int"), + ); + return [ + this.detachFromCurrentScope(cView), + this.detachFromCurrentScope(gView), + ]; + }); + try { + gpuView.copyFrom(cpuView); + } finally { + cpuView.dispose(); + gpuView.dispose(); + } + } + } await device.sync(); this.tensorCacheUpdate(rec.name, gpu_arr, false); cpu_arr.dispose(); @@ -2258,6 +2362,28 @@ export class Instance implements Disposable { case TypeIndex.kTVMFFIOpaquePtr: { return this.memory.loadPointer(valuePtr); } + case TypeIndex.kTVMFFIShape: { + const shapeObjPtr = this.memory.loadPointer(valuePtr); + if (shapeObjPtr === 0) { + return null; + } + if (callbackArg) { + const shapeCellPtr = shapeObjPtr + SizeOf.ObjectHeader; + const shapeDataPtr = this.memory.loadPointer(shapeCellPtr); + const shapeLen = this.memory.loadUSize(shapeCellPtr + this.memory.sizeofPtr()); + const result = new Array(shapeLen); + for (let i = 0; i < shapeLen; ++i) { + result[i] = this.memory.loadI64(shapeDataPtr + i * SizeOf.I64); + } + this.lib.checkCall( + (this.lib.exports.TVMFFIObjectDecRef as ctypes.FTVMFFIObjectDecRef)(shapeObjPtr) + ); + return result; + } + return this.ctx.attachToCurrentScope( + new TVMObject(shapeObjPtr, this.lib, this.ctx) + ); + } case TypeIndex.kTVMFFITensor: { return this.ctx.attachToCurrentScope( new Tensor(this.memory.loadPointer(valuePtr), this.lib, this.ctx, false)