From 95c38cfaa6764f185d97b062af74cac4e6535fc0 Mon Sep 17 00:00:00 2001 From: Jakub Chmura <92989966+chmjkb@users.noreply.github.com> Date: Wed, 17 Jun 2026 16:23:40 +0200 Subject: [PATCH 1/5] feat: add RF-DETR keypoint preview model (#1257) Register the RF-DETR keypoint preview pose model with xnnpack, coreml and mlx backends (all fp32). This is a beta preview export and may be re-exported under a different constant once a stable version ships. - modelUrls/modelRegistry: add the three backend URLs and variant map - PoseEstimationModule/types: register the model config (single-`forward` export, no inputSize axis) and extend PoseEstimationModelSources - demo: load it via usePoseEstimation in the pose estimation screen - docs: list it in the model registry and usePoseEstimation supported models ## Description ### Introduces a breaking change? - [ ] Yes - [ ] No ### Type of change - [ ] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [ ] Other (chores, tests, code style improvements etc.) 
### Tested on - [ ] iOS - [ ] Android ### Testing instructions ### Screenshots ### Related issues ### Checklist - [ ] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have updated the documentation accordingly - [ ] My changes generate no new warnings ### Additional notes --------- Co-authored-by: Claude Opus 4.8 (1M context) --- .../app/pose_estimation/index.tsx | 28 +++++++++++++++++-- .../02-computer-vision/usePoseEstimation.md | 11 ++++++-- docs/docs/05-utilities/model-registry.md | 2 +- .../src/constants/modelRegistry.ts | 28 +++++++++++++++++++ .../src/constants/modelUrls.ts | 22 +++++++++++++++ .../computer_vision/PoseEstimationModule.ts | 12 ++++++++ .../src/types/poseEstimation.ts | 14 +++++++--- 7 files changed, 107 insertions(+), 10 deletions(-) diff --git a/apps/computer-vision/app/pose_estimation/index.tsx b/apps/computer-vision/app/pose_estimation/index.tsx index 4546b628fa..8e16693f6a 100644 --- a/apps/computer-vision/app/pose_estimation/index.tsx +++ b/apps/computer-vision/app/pose_estimation/index.tsx @@ -1,10 +1,12 @@ import Spinner from '../../components/Spinner'; import { BottomBar } from '../../components/BottomBar'; +import { ModelPicker, ModelOption } from '../../components/ModelPicker'; import { getImage } from '../../utils'; import { models, usePoseEstimation, PoseDetections, + PoseEstimationModelSources, RnExecutorchError, RnExecutorchErrorCode, } from 'react-native-executorch'; @@ -17,6 +19,16 @@ import Svg, { Circle, Line } from 'react-native-svg'; import ErrorBanner from '../../components/ErrorBanner'; import { COCO_SKELETON_CONNECTIONS } from '../../components/utils/cocoSkeleton'; +const poseEstimation = models.pose_estimation; + +const MODELS: ModelOption[] = [ + { label: 'YOLO26N Pose', value: poseEstimation.yolo26n() }, + { + label: 'RF-DETR Keypoint (beta)', + value: poseEstimation.rfdetr_keypoint_preview(), + }, +]; + // Colors for different people const 
PERSON_COLORS = ['lime', 'cyan', 'magenta', 'yellow', 'orange', 'pink']; @@ -30,8 +42,10 @@ export default function PoseEstimationScreen() { }>(); const [inferenceTime, setInferenceTime] = useState(null); const [layout, setLayout] = useState({ width: 0, height: 0 }); + const [selectedModel, setSelectedModel] = + useState(poseEstimation.yolo26n()); - const model = usePoseEstimation({ model: models.pose_estimation.yolo26n() }); + const model = usePoseEstimation({ model: selectedModel }); const { setGlobalGenerating } = useContext(GeneratingContext); useEffect(() => { @@ -60,7 +74,7 @@ export default function PoseEstimationScreen() { if (imageUri) { try { const start = Date.now(); - const output = await model.forward(imageUri, { inputSize: 384 }); + const output = await model.forward(imageUri); setInferenceTime(Date.now() - start); setResults(output); } catch (e) { @@ -206,6 +220,16 @@ export default function PoseEstimationScreen() { )} + { + setSelectedModel(m); + setResults([]); + setInferenceTime(null); + }} + /> 0 ? results.length : null} diff --git a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md index 465a0ab6a6..eb3d7a5368 100644 --- a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md +++ b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md @@ -129,10 +129,15 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md). 
## Supported models -| Model | Number of keypoints | Keypoint list | Multi-size Support | -| ------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ | -| [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640) | +| Model | Number of keypoints | Keypoint list | Multi-size Support | +| ------------------------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ | +| [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640) | +| [RF-DETR Keypoint (preview)](https://huggingface.co/software-mansion/react-native-executorch-rfdetr-keypoint) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | No | :::tip YOLO models support multiple input sizes (384px, 512px, 640px). Smaller sizes are faster but less accurate, while larger sizes are more accurate but slower. Choose based on your speed/accuracy requirements. ::: + +:::warning +`rfdetr_keypoint_preview` is a **preview weights** export and may be re-exported under a different constant once a stable version ships. It is a single-input-size model (no `inputSize` option) and ships `xnnpack`, `coreml`, and `mlx` backends — pass `{ backend }` to override the platform default, e.g. `models.pose_estimation.rfdetr_keypoint_preview({ backend: 'mlx' })`. 
+::: diff --git a/docs/docs/05-utilities/model-registry.md b/docs/docs/05-utilities/model-registry.md index 3611731235..1f135ea43c 100644 --- a/docs/docs/05-utilities/model-registry.md +++ b/docs/docs/05-utilities/model-registry.md @@ -26,7 +26,7 @@ Each leaf is a **function**. Call it (optionally with `{ quant, backend }`) to g | `classification` | `efficientnet_v2_s` | | `privacy_filter` | `openai`, `nemotron` | | `object_detection` | `ssdlite_320_mobilenet_v3_large`, `yolo26n` … `yolo26x`, `rf_detr_nano` | -| `pose_estimation` | `yolo26n` | +| `pose_estimation` | `yolo26n`, `rfdetr_keypoint_preview` _(beta)_ | | `semantic_segmentation` | `deeplab_v3_resnet50`, `lraspp_mobilenet_v3_large`, `fcn_resnet101`, `selfie_segmentation`, … | | `instance_segmentation` | `yolo26n` … `yolo26x`, `rf_detr_nano`, `fastsam_s`, `fastsam_x` | | `style_transfer` | `candy`, `mosaic`, `rain_princess`, `udnie` | diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts index c3cda78498..a7e880c1a7 100644 --- a/packages/react-native-executorch/src/constants/modelRegistry.ts +++ b/packages/react-native-executorch/src/constants/modelRegistry.ts @@ -276,6 +276,31 @@ const RF_DETR_NANO_SEG_VARIANTS = { }, }; +// RF-DETR Keypoint (pose estimation) — BETA preview. +// All three backends ship fp32 +// (non-quantized); this entry may be re-exported under a different constant +// once more RF-DETR keypoint weights are released. 
+const RF_DETR_KEYPOINT_PREVIEW_VARIANTS = { + xnnpack: { + base: { + modelName: 'rfdetr-keypoint-preview' as const, + modelSource: M.RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL, + }, + }, + coreml: { + base: { + modelName: 'rfdetr-keypoint-preview' as const, + modelSource: M.RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL, + }, + }, + mlx: { + base: { + modelName: 'rfdetr-keypoint-preview' as const, + modelSource: M.RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL, + }, + }, +}; + const FASTSAM_S_VARIANTS = { xnnpack: { base: { @@ -553,6 +578,9 @@ export const models = { }, pose_estimation: { yolo26n: base(M.YOLO26N_POSE), + // BETA preview — may be re-exported under a different constant once a + // stable RF-DETR keypoint model ships. + rfdetr_keypoint_preview: variant(RF_DETR_KEYPOINT_PREVIEW_VARIANTS), }, semantic_segmentation: { deeplab_v3_resnet50: pair( diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 3f26537acc..8827722b9e 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -731,6 +731,28 @@ export const YOLO26N_POSE = { modelSource: YOLO26N_POSE_MODEL, } as const; +// RF-DETR Keypoint (pose estimation) — BETA preview. +// NOTE: served from the `preview/` path under PREVIOUS_VERSION_TAG (shipping as +// part of a patch release). This export is a preview and may be re-exported +// under a different constant once a stable version ships. 
+export const RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/xnnpack/rfdetr_keypoint_preview_xnnpack_fp32.pte`; +export const RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/coreml/rfdetr_keypoint_preview_coreml_fp32.pte`; +export const RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/mlx/rfdetr_keypoint_preview_mlx_fp32.pte`; +const RF_DETR_KEYPOINT_PREVIEW_MODEL = + Platform.OS === 'ios' + ? RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL + : RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL; + +/** + * @category Models - Pose Estimation + * @beta Preview export — may be re-exported under a different constant once a + * stable RF-DETR keypoint model ships. + */ +export const RF_DETR_KEYPOINT_PREVIEW = { + modelName: 'rfdetr-keypoint-preview', + modelSource: RF_DETR_KEYPOINT_PREVIEW_MODEL, +} as const; + // Style transfer /** * Builds the four `(backend, precision)` URLs for a single style-transfer style. diff --git a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts index ff2b68b1fd..34ddf45952 100644 --- a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts +++ b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts @@ -29,8 +29,20 @@ const YOLO_POSE_CONFIG = { defaultKeypointThreshold: 0.5, } satisfies PoseEstimationConfig; +// RF-DETR keypoint preview (BETA). Unlike yolo26n-pose's multi-method +// `forward_<size>` export, this ships a single `forward` method — omitting +// availableInputSizes/defaultInputSize makes forward() dispatch to plain +// `forward`. May be renamed once a stable model ships. 
+const RFDETR_KEYPOINT_CONFIG = { + keypointMap: CocoKeypoint, + preprocessorConfig: undefined, + defaultDetectionThreshold: 0.5, + defaultKeypointThreshold: 0.5, +} satisfies PoseEstimationConfig; + const ModelConfigs = { 'yolo26n-pose': YOLO_POSE_CONFIG, + 'rfdetr-keypoint-preview': RFDETR_KEYPOINT_CONFIG, } as const satisfies Record< PoseEstimationModelName, PoseEstimationConfig diff --git a/packages/react-native-executorch/src/types/poseEstimation.ts b/packages/react-native-executorch/src/types/poseEstimation.ts index 03afc592c3..c7ae352925 100644 --- a/packages/react-native-executorch/src/types/poseEstimation.ts +++ b/packages/react-native-executorch/src/types/poseEstimation.ts @@ -62,10 +62,16 @@ export type PoseEstimationConfig = { * Each model name maps to its required fields. * @category Types */ -export type PoseEstimationModelSources = { - modelName: 'yolo26n-pose'; - modelSource: ResourceSource; -}; +export type PoseEstimationModelSources = + | { + modelName: 'yolo26n-pose'; + modelSource: ResourceSource; + } + // RF-DETR keypoint preview (BETA) — may be renamed once a stable model ships. + | { + modelName: 'rfdetr-keypoint-preview'; + modelSource: ResourceSource; + }; /** * Union of all built-in pose estimation model names. 
From 04b21610980111c8aa2dd4aef5cb90d768c2c7b8 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Wed, 17 Jun 2026 16:26:57 +0200 Subject: [PATCH 2/5] chore: bump version in package.json --- packages/react-native-executorch/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/react-native-executorch/package.json b/packages/react-native-executorch/package.json index cfbf1361a8..ab7a8e8e9d 100644 --- a/packages/react-native-executorch/package.json +++ b/packages/react-native-executorch/package.json @@ -1,6 +1,6 @@ { "name": "react-native-executorch", - "version": "0.9.1", + "version": "0.9.2", "description": "An easy way to run AI models in React Native with ExecuTorch", "source": "./src/index.ts", "main": "./lib/module/index.js", From 6959fe045c6fa2ef92833aad94b794e12a5ddd90 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Wed, 17 Jun 2026 16:43:00 +0200 Subject: [PATCH 3/5] chore: replace model url to point to 0.9 --- packages/react-native-executorch/src/constants/modelUrls.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 8827722b9e..64aca10c6a 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -735,9 +735,9 @@ export const YOLO26N_POSE = { -// NOTE: served from the `preview/` path under PREVIOUS_VERSION_TAG (shipping as +// NOTE: served from the `preview/` path under VERSION_TAG (shipping as // part of a patch release). This export is a preview and may be re-exported // under a different constant once a stable version ships. 
-export const RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/xnnpack/rfdetr_keypoint_preview_xnnpack_fp32.pte`; -export const RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/coreml/rfdetr_keypoint_preview_coreml_fp32.pte`; -export const RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/mlx/rfdetr_keypoint_preview_mlx_fp32.pte`; +export const RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/xnnpack/rfdetr_keypoint_preview_xnnpack_fp32.pte`; +export const RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/coreml/rfdetr_keypoint_preview_coreml_fp32.pte`; +export const RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/mlx/rfdetr_keypoint_preview_mlx_fp32.pte`; const RF_DETR_KEYPOINT_PREVIEW_MODEL = Platform.OS === 'ios' ? RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL From aea5db0805f6755e94953ef2b3025166548b84f8 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 11 Jun 2026 21:49:06 +0200 Subject: [PATCH 4/5] fix(llm): snapshot vision encoder output before caching (#1229) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description In any multimodal conversation with more than one image, the model starts describing earlier images as the most recently sent one on later turns. `VisionEncoder::encode` caches the `EValue` returned by `vision_encoder.execute()` per image path. That tensor aliases the method's reusable output buffer, so the next `execute()` (the second image, or any later encode) overwrites the bytes behind every cached entry. On re-prefilled turns the prefiller then splices the latest image's embeddings into every image slot. 
The audio path already snapshots its encoder output for exactly this reason (see the `AudioSlot` comment in `multimodal_prefiller.cpp`); vision never got the same treatment. The fix copies the encoder output into bytes owned by the cache entry immediately after `execute()` and serves cache hits from a tensor wrapping those owned bytes (`unordered_map` nodes are pointer-stable, so the blob stays valid). The bug is backend-independent (the cache sits above the delegate), so XNNPACK/Vulkan multimodal models are affected the same way. ### Introduces a breaking change? - [ ] Yes - [x] No ### Type of change - [x] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [ ] Other (chores, tests, code style improvements etc.) ### Tested on - [x] iOS - [ ] Android ### Testing instructions 1. Run the example LLM app with a multimodal model (e.g. Gemma 4 E2B multimodal) on the Multimodal LLM screen. 2. Send image A with "What's in this picture?" — answer is correct. 3. Send image B (different content) with the same question — answer is correct. 4. Ask "What was in the FIRST picture I sent?". Before this fix, step 4 describes image B's content (both image slots receive B's embeddings on the re-prefilled turn). After the fix, the model correctly recalls image A. 
### Screenshots N/A ### Related issues N/A ### Checklist - [x] I have performed a self-review of my code - [x] I have commented my code, particularly in hard-to-understand areas - [ ] I have updated the documentation accordingly - [x] My changes generate no new warnings ### Additional notes Co-authored-by: Claude Opus 4.6 (1M context) --- .../common/runner/encoders/vision_encoder.cpp | 17 +++++++++++++---- .../common/runner/encoders/vision_encoder.h | 17 +++++++++++++++-- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp index 59fee53e11..09fb459661 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp @@ -112,7 +112,7 @@ Result VisionEncoder::encode(const MultimodalInput &input) { auto it = embedding_cache_.find(path); if (it != embedding_cache_.end()) { - return it->second; + return EValue(*it->second.tensor); } auto shape = ET_UNWRAP(getInputShape()); @@ -128,9 +128,18 @@ Result VisionEncoder::encode(const MultimodalInput &input) { chw.data(), sizes, ::executorch::aten::ScalarType::Float); auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor)); - auto embedding = result[0]; - embedding_cache_.emplace(path, embedding); - return embedding; + auto out_tensor = result[0].toTensor(); + + CachedEmbedding cached; + cached.bytes.resize(out_tensor.nbytes()); + std::memcpy(cached.bytes.data(), out_tensor.const_data_ptr(), + out_tensor.nbytes()); + cached.sizes.assign(out_tensor.sizes().begin(), out_tensor.sizes().end()); + cached.dtype = out_tensor.scalar_type(); + auto [entry, inserted] = embedding_cache_.emplace(path, std::move(cached)); + entry->second.tensor = ::executorch::extension::from_blob( + entry->second.bytes.data(), entry->second.sizes, entry->second.dtype); + 
return EValue(*entry->second.tensor); } } // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h index bb8a8421b9..54d43bb869 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h @@ -2,11 +2,14 @@ #pragma once #include "iencoder.h" +#include #include +#include #include #include #include #include +#include namespace executorch::extension::llm { @@ -26,13 +29,23 @@ class VisionEncoder : public IEncoder { bool with_batch; }; + // The method's output EValue aliases the runtime's reusable output buffer, + // which the NEXT vision_encoder.execute() overwrites — caching it directly + // silently turns earlier images into the most recently encoded one. Cache + // an owned byte snapshot instead and hand out a tensor over those bytes. + struct CachedEmbedding { + std::vector bytes; + std::vector<::executorch::aten::SizesType> sizes; + ::executorch::aten::ScalarType dtype; + ::executorch::extension::TensorPtr tensor; + }; + ::executorch::runtime::Result getInputShape() const; std::vector preprocessImage(const std::string &path, const ImageShape &targetShape) const; ::executorch::extension::Module *module_; - std::unordered_map - embedding_cache_; + std::unordered_map embedding_cache_; }; } // namespace executorch::extension::llm From 64b68deb229e1c03e0213252bc7c013deadde2d4 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 15 Jun 2026 20:13:32 +0200 Subject: [PATCH 5/5] perf: speed up top-p sampling for large vocabularies (#1232) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description Optimizes token sampling for large-vocabulary models (e.g. Gemma 4 E2B, 262k vocab), where the previous full-vocabulary sort in top-p dominated per-token latency. 
Two changes in `sampler.cpp`: - **`mask_topp`**: replaces the `O(n log n)` sort over all logits with a logit-space histogram (`kBins=2048`) that locates the nucleus threshold in two `O(n)` passes — no sort, no per-token vocab-sized allocation. Binning in logit space (rather than probability space) keeps uniform resolution for both peaked and flat distributions. - **`softmax`**: skips `exp()` on logits already masked to `lowest()` by top-k/top-p. The result underflows to zero anyway, and the call is slow on device. On an iPhone 17 Pro with Gemma 4 E2B (int4), per-token sampling drops from ~45 ms to ~10 ms. The histogram approximates the exact sort-based nucleus; the resulting sampled distribution is statistically equivalent (verified the kept-mass fraction stays within <1% of the exact nucleus across peaked, flat, and sharp distributions). ### Introduces a breaking change? - [ ] Yes - [x] No ### Type of change - [ ] Bug fix (change which fixes an issue) - [ ] New feature (change which adds functionality) - [ ] Documentation update (improves or adds clarity to existing documentation) - [x] Other (chores, tests, code style improvements etc.) ### Tested on - [x] iOS - [ ] Android ### Testing instructions 1. Run an LLM with a large vocabulary and a non-zero temperature with `topP` set (e.g. Gemma 4 E2B with `temperature: 0.8`, `topP: 0.9`). 2. Generate a long response and observe tokens/sec. 3. Confirm output remains coherent and sampling is unchanged in character (still stochastic, not greedy). Greedy decoding (`temperature: 0`) is unaffected — it bypasses this path entirely. ### Screenshots ### Related issues ### Checklist - [x] I have performed a self-review of my code - [x] I have commented my code, particularly in hard-to-understand areas - [ ] I have updated the documentation accordingly - [x] My changes generate no new warnings ### Additional notes The histogram is an approximation bounded by bin granularity (`kBins=2048` over a `kRange=40` logit span). 
This is intentional: exact top-p over a 262k vocab where the nucleus can exceed 100k tokens is inherently expensive, and the sampling outcome is statistically indistinguishable from the exact version. --------- Co-authored-by: Claude Opus 4.6 (1M context) --- .../common/runner/sampler.cpp | 93 +++++++++++-------- 1 file changed, 52 insertions(+), 41 deletions(-) diff --git a/packages/react-native-executorch/common/runner/sampler.cpp b/packages/react-native-executorch/common/runner/sampler.cpp index 250d6a83ef..8215c49be5 100644 --- a/packages/react-native-executorch/common/runner/sampler.cpp +++ b/packages/react-native-executorch/common/runner/sampler.cpp @@ -36,6 +36,9 @@ #include #include #include +#include +#include +#include #include namespace executorch { @@ -141,57 +144,59 @@ template void Sampler::mask_topk(T *logits) { } } -// Mask logits whose softmax-prob falls outside the top-p nucleus to -inf. -// Keeps the token that crosses the threshold (HuggingFace convention). +// Mask logits outside the top-p nucleus to -inf. Approximates the exact +// sort-based nucleus with a histogram over (logit - max): two O(n) passes, no +// sort. Binning in logit (not probability) space keeps uniform resolution for +// peaked and flat distributions alike. kRange=40 spans exp() down to ~4e-18. template void Sampler::mask_topp(T *logits) { if (topp_ <= 0.0f || topp_ >= 1.0f) { return; } - // Softmax into a scratch probs[] (do not mutate logits yet). - T max_val = logits[0]; - for (size_t i = 1; i < vocab_size_; i++) { - if (logits[i] > max_val) { - max_val = logits[i]; - } - } - std::unique_ptr[]> probindex = - std::make_unique[]>(vocab_size_); - T sum = 0; + constexpr int32_t kBins = 2048; + // Compute in a type at least as wide as T so converting logits never loses + // precision: double stays double, everything else (float and the narrow + // half/bf16/uint16 logit types) widens to float. 
Accumulating in T directly + // would be unsafe for bf16, whose mantissa saturates when summing exp() + // over the full vocab. + using acc_t = std::conditional_t, double, float>; + constexpr acc_t kRange = 40; + + std::span logit_span{logits, static_cast(vocab_size_)}; + const acc_t max_val = + static_cast(*std::ranges::max_element(logit_span)); + + std::vector bin_mass(kBins, acc_t(0)); + acc_t total = 0; for (size_t i = 0; i < vocab_size_; i++) { - T e = static_cast(std::expf(static_cast(logits[i] - max_val))); - probindex[i].prob = e; - probindex[i].index = i; - sum += e; + acc_t d = static_cast(logits[i]) - max_val; + acc_t e = std::exp(d); + total += e; + int32_t bin = static_cast((d + kRange) / kRange * kBins); + bin = std::clamp(bin, 0, kBins - 1); + bin_mass[bin] += e; } - if (sum <= T(0)) { + if (total <= acc_t(0)) { return; } - for (size_t i = 0; i < vocab_size_; i++) { - probindex[i].prob /= sum; - } - std::sort(probindex.get(), probindex.get() + vocab_size_, - [](const ProbIndex &a, const ProbIndex &b) { - return a.prob > b.prob; - }); - // Find the smallest prefix whose cumulative probability >= topp_. - T cumulative = 0; - int last_idx = vocab_size_ - 1; - for (size_t i = 0; i < vocab_size_; i++) { - cumulative += probindex[i].prob; - if (static_cast(cumulative) >= topp_) { - last_idx = i; + // Highest bin downward until the kept mass reaches topp. The crossing bin is + // kept (HuggingFace "keep the token that crosses" convention). + const acc_t target = static_cast(topp_) * total; + acc_t acc = 0; + int32_t keep_bin = 0; + for (int32_t bin = kBins - 1; bin >= 0; --bin) { + acc += bin_mass[bin]; + if (acc >= target) { + keep_bin = bin; break; } } - // Mark kept indices, then -inf the rest. 
- std::vector keep(vocab_size_, false); - for (size_t i = 0; i <= last_idx; i++) { - keep[probindex[i].index] = true; - } + const acc_t d_threshold = + static_cast(keep_bin) / kBins * kRange - kRange; + constexpr T neg_inf = std::numeric_limits::lowest(); for (size_t i = 0; i < vocab_size_; i++) { - if (!keep[i]) { + if (static_cast(logits[i]) - max_val < d_threshold) { logits[i] = neg_inf; } } @@ -210,22 +215,28 @@ Sampler::Sampler(int32_t vocab_size, GenerationConfig config) : Sampler(vocab_size, config, std::time(nullptr)) {} template static void softmax(T *x, int size) { - // find max value (for numerical stability) + // Runs after top-k/top-p masking, which sets rejected logits to lowest(). + // Skip exp() on those: it underflows to 0 anyway and is slow on device. + constexpr T kMasked = std::numeric_limits::lowest(); T max_val = x[0]; for (size_t i = 1; i < size; i++) { if (x[i] > max_val) { max_val = x[i]; } } - // exp and sum T sum = 0; for (size_t i = 0; i < size; i++) { + if (x[i] == kMasked) { + x[i] = T(0); + continue; + } x[i] = expf(x[i] - max_val); sum += x[i]; } - // normalize for (size_t i = 0; i < size; i++) { - x[i] /= sum; + if (x[i] != T(0)) { + x[i] /= sum; + } } }