Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions web/emcc/wasm_runtime.cc
Original file line number Diff line number Diff line change
Expand Up @@ -130,20 +130,29 @@ void ArrayDecodeStorage(Tensor cpu_arr, TVMFFIByteArray* bytes, const std::strin
const char* byte_data = bytes->data;
const size_t byte_size = bytes->size;
if (format == "f32-to-bf16" && dtype == "float32") {
const uint16_t* bf16 = reinterpret_cast<const uint16_t*>(byte_data);
uint32_t* data = static_cast<uint32_t*>(cpu_arr->data);
TVM_FFI_ICHECK(cpu_arr.IsContiguous());
size_t size = 1;
for (int i = 0; i < cpu_arr->ndim; ++i) {
size *= cpu_arr->shape[i];
}
TVM_FFI_ICHECK_EQ(size, byte_size / 2);
for (size_t i = 0; i < size; ++i) {
data[i] = static_cast<uint32_t>(bf16[i]) << 16;
// The "f32-to-bf16" format encodes a float32 tensor as packed bf16 (2
// bytes per element). When the byte_size matches that expectation, expand
// back to f32. If the byte_size matches the native float32 width
// (4 bytes per element), the payload is already raw float32; fall through
// to the generic byte copy. This makes the loader tolerant of weight
// shards produced by older / alternate quantisation pipelines that retain
// the "f32-to-bf16" tag without performing the bf16 truncation.
if (size == byte_size / 2) {
const uint16_t* bf16 = reinterpret_cast<const uint16_t*>(byte_data);
uint32_t* data =
reinterpret_cast<uint32_t*>(static_cast<char*>(cpu_arr->data) + cpu_arr->byte_offset);
for (size_t i = 0; i < size; ++i) {
data[i] = static_cast<uint32_t>(bf16[i]) << 16;
}
return;
}
} else {
cpu_arr.CopyFromBytes(byte_data, byte_size);
}
cpu_arr.CopyFromBytes(byte_data, byte_size);
}

TVM_FFI_STATIC_INIT_BLOCK() {
Expand Down
130 changes: 127 additions & 3 deletions web/src/runtime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1323,6 +1323,23 @@ export class Instance implements Disposable {
artifactCache: ArtifactCacheTemplate,
signal?: AbortSignal,
) {
// Avoid a single JS-to-wasm byte-array call for multi-hundred-MiB
// tensor-cache records. The cap is a conservative per-call staging size,
// independent of the final tensor allocation size. Smaller records keep
// the existing full-record path.
const maxChunkBytes = 128 * 1024 * 1024;

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is this number determined? It would be good to have a sense of what the WebGPU runtime supports; the motivation here is not clear.

/**
 * Number of bytes one element of `dtype` occupies in tensor storage.
 *
 * Accepts dtype strings of the form `<name><bits>[x<lanes>]` (e.g. `float32`,
 * `uint8x4`), the special name `bool` (one byte), and low-precision float
 * names whose bit width is followed by a format suffix (e.g.
 * `float8_e4m3fn`, `float8_e5m2x4`).
 *
 * @param dtype - Data type string of the tensor record.
 * @returns Bytes per element; a total width that is not a multiple of 8 bits
 *   is rounded up to the next whole byte.
 * @throws Error if no bit width can be extracted from `dtype`.
 */
const storageBytes = (dtype: string): number => {
if (dtype === "bool") {
return 1;
}
// In names such as "float8_e4m3fn" the trailing digits belong to the
// exponent/mantissa suffix (or there are none at all), so the bit width has
// to be read from right after "float" instead of from the end of the string.
// The lazy `[a-z]*?` lets an optional "x<lanes>" suffix still be recognised.
const suffixedFloat = dtype.match(/^float(\d+)_e\d+m\d+[a-z]*?(?:x(\d+))?$/);
// Every other dtype keeps the original rule: trailing "<bits>[x<lanes>]".
const match = suffixedFloat !== null ? suffixedFloat : dtype.match(/(\d+)(?:x(\d+))?$/);
if (match === null) {
throw new Error("Cannot determine storage width of dtype " + dtype);
}
const bits = Number(match[1]);
const lanes = match[2] === undefined ? 1 : Number(match[2]);
// Round the total bit count up to whole bytes.
return (bits * lanes + 7) >> 3;
};
Comment on lines +1331 to +1342

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The storageBytes helper uses a regular expression to extract the bit width from the dtype string. However, if the dtype is "bool", the regex will fail to match any digits and throw an error. In TVM, boolean tensors are commonly represented with the "bool" dtype (which is 1 byte / 8 bits in DLPack). Adding an explicit fallback for "bool" makes the helper more robust.

Suggested change
const storageBytes = (dtype: string) => {
const match = dtype.match(/(\d+)(?:x(\d+))?$/);
if (match === null) {
throw new Error("Cannot determine storage width of dtype " + dtype);
}
const bits = Number(match[1]);
const lanes = match[2] === undefined ? 1 : Number(match[2]);
return (bits * lanes + 7) >> 3;
};
const storageBytes = (dtype: string) => {
if (dtype === "bool") {
return 1;
}
const match = dtype.match(/(\d+)(?:x(\d+))?$/);
if (match === null) {
throw new Error("Cannot determine storage width of dtype " + dtype);
}
const bits = Number(match[1]);
const lanes = match[2] === undefined ? 1 : Number(match[2]);
return (bits * lanes + 7) >> 3;
};

const perf = compact.getPerformance();
const tstart = perf.now();
let totalBytes = 0;
Expand Down Expand Up @@ -1421,9 +1438,59 @@ export class Instance implements Disposable {
this.empty(rec.shape, rec.dtype, this.cpu())
)
});
const recSource = buffer.slice(rec.byteOffset, rec.byteOffset + rec.nbytes);
const shardBytes = buffer instanceof Uint8Array ? buffer : new Uint8Array(buffer);
const recSource =
rec.byteOffset === 0 && rec.nbytes === shardBytes.byteLength
? shardBytes
: shardBytes.subarray(rec.byteOffset, rec.byteOffset + rec.nbytes);
const canChunkRecord =
rec.nbytes > maxChunkBytes &&
rec.shape.length >= 1 &&
Number.isInteger(rec.shape[0]) &&
rec.shape[0] > 0 &&
rec.nbytes % rec.shape[0] === 0;
const outerDim = canChunkRecord ? rec.shape[0] : 1;
const sourceStrideBytes = canChunkRecord ? rec.nbytes / outerDim : rec.nbytes;
const targetBytes = rec.shape.reduce((acc, value) => acc * value, 1) *
storageBytes(rec.dtype);
const targetStrideBytes = canChunkRecord ? targetBytes / outerDim : targetBytes;
/**
 * Decode the bytes of the current record (`rec`) into `targetTensor` through
 * `this.ctx.arrayDecodeStorage`.
 *
 * When `canChunkRecord` is false the whole record is passed in a single call.
 * Otherwise the record is split along its first dimension and each slice is
 * decoded into a view of `targetTensor`, so that no single call receives much
 * more than `maxChunkBytes` of source data.
 *
 * @param targetTensor - Tensor that receives the decoded data.
 * @param sourceBytes - Encoded bytes of the record (`rec.nbytes` long).
 */
const copyRecordToTensor = (targetTensor: Tensor, sourceBytes: Uint8Array) => {
if (!canChunkRecord) {
this.ctx.arrayDecodeStorage(targetTensor, sourceBytes, rec.format, rec.dtype);
return;
}
// Outer-dimension entries handled per call. Clamped to at least 1, so a single
// entry larger than maxChunkBytes is still passed whole.
const chunkOuterDim = Math.max(1, Math.floor(maxChunkBytes / sourceStrideBytes));
for (let outerOffset = 0; outerOffset < outerDim; outerOffset += chunkOuterDim) {
// The last chunk may hold fewer entries than chunkOuterDim.
const outerCount = Math.min(chunkOuterDim, outerDim - outerOffset);
// Source and target use separate strides: sourceStrideBytes is derived from
// rec.nbytes (encoded size), targetStrideBytes from shape * storageBytes(dtype),
// and the two sizes need not be equal.
const sourceByteOffset = outerOffset * sourceStrideBytes;
const targetByteOffset = outerOffset * targetStrideBytes;
const chunkBytes = outerCount * sourceStrideBytes;
// Same shape as the record, except for the number of outer entries in this chunk.
const chunkShape = rec.shape.slice();
chunkShape[0] = outerCount;
const chunkShapeTuple = this.makeShapeTuple(chunkShape);
// Create the view inside its own scope and detach it, so that it is released
// by the explicit dispose() in the finally block below.
// NOTE(review): the last argument is presumably a byte offset into the
// target's storage — confirm against tensorCreateView.
const chunkView = this.withNewScope(() => {
return this.detachFromCurrentScope(
this.ctx.tensorCreateView(
targetTensor,
chunkShapeTuple,
rec.dtype,
new Scalar(targetByteOffset, "int"),
)
);
});
// subarray() returns a view over the same underlying buffer; no bytes are copied here.
const chunkSource = sourceBytes.subarray(
sourceByteOffset,
sourceByteOffset + chunkBytes,
);
try {
this.ctx.arrayDecodeStorage(chunkView, chunkSource, rec.format, rec.dtype);
} finally {
// Dispose the view even if decoding throws.
chunkView.dispose();
}
}
};
// first sync copy to cpu.
this.ctx.arrayDecodeStorage(cpu_arr, new Uint8Array(recSource), rec.format, rec.dtype);
copyRecordToTensor(cpu_arr, recSource);
// then async stream into GPU if needed
if (device.deviceType === DeviceStrToEnum.cpu) {
this.tensorCacheUpdate(rec.name, cpu_arr, false);
Expand All @@ -1435,7 +1502,42 @@ export class Instance implements Disposable {
this.empty(rec.shape, rec.dtype, device)
)
});
gpu_arr.copyFrom(cpu_arr);
if (!canChunkRecord) {
gpu_arr.copyFrom(cpu_arr);
} else {
const chunkOuterDim = Math.max(1, Math.floor(maxChunkBytes / sourceStrideBytes));
for (let outerOffset = 0; outerOffset < outerDim; outerOffset += chunkOuterDim) {
const outerCount = Math.min(chunkOuterDim, outerDim - outerOffset);
const targetByteOffset = outerOffset * targetStrideBytes;
const chunkShape = rec.shape.slice();
chunkShape[0] = outerCount;
const chunkShapeTuple = this.makeShapeTuple(chunkShape);
const [cpuView, gpuView] = this.withNewScope(() => {
const cView = this.ctx.tensorCreateView(
cpu_arr,
chunkShapeTuple,
rec.dtype,
new Scalar(targetByteOffset, "int"),
);
const gView = this.ctx.tensorCreateView(
gpu_arr,
chunkShapeTuple,
rec.dtype,
new Scalar(targetByteOffset, "int"),
);
return [
this.detachFromCurrentScope(cView),
this.detachFromCurrentScope(gView),
];
});
Comment on lines +1515 to +1532

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

In the GPU copy path, cpuView and gpuView are created and immediately detached from the current scope inside the withNewScope block. If the second tensorCreateView (for gpuView) throws an exception, the first view (cpuView) will have already been detached from the scope's auto-disposal list, but the assignment to [cpuView, gpuView] will never complete. This results in a resource leak of cpuView because it is neither auto-disposed by the scope nor disposed in the finally block. To prevent this, detach both views only after both have been successfully created.

                const [cpuView, gpuView] = this.withNewScope(() => {
                  const cView = this.ctx.tensorCreateView(
                    cpu_arr,
                    chunkShapeTuple,
                    rec.dtype,
                    new Scalar(targetByteOffset, "int"),
                  );
                  const gView = this.ctx.tensorCreateView(
                    gpu_arr,
                    chunkShapeTuple,
                    rec.dtype,
                    new Scalar(targetByteOffset, "int"),
                  );
                  return [
                    this.detachFromCurrentScope(cView),
                    this.detachFromCurrentScope(gView),
                  ];
                });

try {
gpuView.copyFrom(cpuView);
} finally {
cpuView.dispose();
gpuView.dispose();
}
}
}
await device.sync();
this.tensorCacheUpdate(rec.name, gpu_arr, false);
cpu_arr.dispose();
Expand Down Expand Up @@ -2258,6 +2360,28 @@ export class Instance implements Disposable {
case TypeIndex.kTVMFFIOpaquePtr: {
return this.memory.loadPointer(valuePtr);
}
case TypeIndex.kTVMFFIShape: {
const shapeObjPtr = this.memory.loadPointer(valuePtr);
if (shapeObjPtr === 0) {
return null;
}
if (callbackArg) {
Comment on lines +2363 to +2368

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Add a defensive null check for shapeObjPtr before performing pointer arithmetic and loading memory. If shapeObjPtr is null (0), accessing it will cause an out-of-bounds memory access or return garbage.

Suggested change
case TypeIndex.kTVMFFIShape: {
const shapeObjPtr = this.memory.loadPointer(valuePtr);
if (callbackArg) {
case TypeIndex.kTVMFFIShape: {
const shapeObjPtr = this.memory.loadPointer(valuePtr);
if (shapeObjPtr === 0) {
return null;
}
if (callbackArg) {

const shapeCellPtr = shapeObjPtr + SizeOf.ObjectHeader;
const shapeDataPtr = this.memory.loadPointer(shapeCellPtr);
const shapeLen = this.memory.loadUSize(shapeCellPtr + this.memory.sizeofPtr());
const result = new Array<number>(shapeLen);
for (let i = 0; i < shapeLen; ++i) {
result[i] = this.memory.loadI64(shapeDataPtr + i * SizeOf.I64);
}
this.lib.checkCall(
(this.lib.exports.TVMFFIObjectDecRef as ctypes.FTVMFFIObjectDecRef)(shapeObjPtr)
);
return result;
}
return this.ctx.attachToCurrentScope(
new TVMObject(shapeObjPtr, this.lib, this.ctx)
);
}
case TypeIndex.kTVMFFITensor: {
return this.ctx.attachToCurrentScope(
new Tensor(this.memory.loadPointer(valuePtr), this.lib, this.ctx, false)
Expand Down