From 95c38cfaa6764f185d97b062af74cac4e6535fc0 Mon Sep 17 00:00:00 2001
From: Jakub Chmura <92989966+chmjkb@users.noreply.github.com>
Date: Wed, 17 Jun 2026 16:23:40 +0200
Subject: [PATCH 1/5] feat: add RF-DETR keypoint preview model (#1257)

Register the RF-DETR keypoint preview pose model with xnnpack, coreml
and mlx backends (all fp32). This is a beta preview export and may be
re-exported under a different constant once a stable version ships.

- modelUrls/modelRegistry: add the three backend URLs and variant map
- PoseEstimationModule/types: register the model config
(single-`forward` export, no inputSize axis) and extend
PoseEstimationModelSources
- demo: load it via usePoseEstimation in the pose estimation screen
- docs: list it in the model registry and usePoseEstimation supported
models

## Description

<!-- Provide a concise and descriptive summary of the changes
implemented in this PR. -->

### Introduces a breaking change?

- [ ] Yes
- [ ] No

### Type of change

- [ ] Bug fix (change which fixes an issue)
- [ ] New feature (change which adds functionality)
- [ ] Documentation update (improves or adds clarity to existing
documentation)
- [ ] Other (chores, tests, code style improvements etc.)

### Tested on

- [ ] iOS
- [ ] Android

### Testing instructions

<!-- Provide step-by-step instructions on how to test your changes.
Include setup details if necessary. -->

### Screenshots

<!-- Add screenshots here, if applicable -->

### Related issues

<!-- Link related issues here using #issue-number -->

### Checklist

- [ ] I have performed a self-review of my code
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] I have updated the documentation accordingly
- [ ] My changes generate no new warnings

### Additional notes

<!-- Include any additional information, assumptions, or context that
reviewers might need to understand this PR. -->

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../app/pose_estimation/index.tsx             | 28 +++++++++++++++++--
 .../02-computer-vision/usePoseEstimation.md   | 11 ++++++--
 docs/docs/05-utilities/model-registry.md      |  2 +-
 .../src/constants/modelRegistry.ts            | 28 +++++++++++++++++++
 .../src/constants/modelUrls.ts                | 22 +++++++++++++++
 .../computer_vision/PoseEstimationModule.ts   | 12 ++++++++
 .../src/types/poseEstimation.ts               | 14 +++++++---
 7 files changed, 107 insertions(+), 10 deletions(-)
diff --git a/apps/computer-vision/app/pose_estimation/index.tsx b/apps/computer-vision/app/pose_estimation/index.tsx
index 4546b628fa..8e16693f6a 100644
--- a/apps/computer-vision/app/pose_estimation/index.tsx
+++ b/apps/computer-vision/app/pose_estimation/index.tsx
@@ -1,10 +1,12 @@
 import Spinner from '../../components/Spinner';
 import { BottomBar } from '../../components/BottomBar';
+import { ModelPicker, ModelOption } from '../../components/ModelPicker';
 import { getImage } from '../../utils';
 import {
   models,
   usePoseEstimation,
   PoseDetections,
+  PoseEstimationModelSources,
   RnExecutorchError,
   RnExecutorchErrorCode,
 } from 'react-native-executorch';
@@ -17,6 +19,16 @@ import Svg, { Circle, Line } from 'react-native-svg';
 import ErrorBanner from '../../components/ErrorBanner';
 import { COCO_SKELETON_CONNECTIONS } from '../../components/utils/cocoSkeleton';
 
+const poseEstimation = models.pose_estimation;
+
+const MODELS: ModelOption<PoseEstimationModelSources>[] = [
+  { label: 'YOLO26N Pose', value: poseEstimation.yolo26n() },
+  {
+    label: 'RF-DETR Keypoint (beta)',
+    value: poseEstimation.rfdetr_keypoint_preview(),
+  },
+];
+
 // Colors for different people
 const PERSON_COLORS = ['lime', 'cyan', 'magenta', 'yellow', 'orange', 'pink'];
 
@@ -30,8 +42,10 @@ export default function PoseEstimationScreen() {
   }>();
   const [inferenceTime, setInferenceTime] = useState<number | null>(null);
   const [layout, setLayout] = useState({ width: 0, height: 0 });
+  const [selectedModel, setSelectedModel] =
+    useState<PoseEstimationModelSources>(poseEstimation.yolo26n());
 
-  const model = usePoseEstimation({ model: models.pose_estimation.yolo26n() });
+  const model = usePoseEstimation({ model: selectedModel });
   const { setGlobalGenerating } = useContext(GeneratingContext);
 
   useEffect(() => {
@@ -60,7 +74,7 @@ export default function PoseEstimationScreen() {
     if (imageUri) {
       try {
         const start = Date.now();
-        const output = await model.forward(imageUri, { inputSize: 384 });
+        const output = await model.forward(imageUri);
         setInferenceTime(Date.now() - start);
         setResults(output);
       } catch (e) {
@@ -206,6 +220,16 @@ export default function PoseEstimationScreen() {
           </View>
         )}
       </View>
+      <ModelPicker
+        models={MODELS}
+        selectedModel={selectedModel}
+        disabled={model.isGenerating}
+        onSelect={(m) => {
+          setSelectedModel(m);
+          setResults([]);
+          setInferenceTime(null);
+        }}
+      />
       <StatsBar
         inferenceTime={inferenceTime}
         detectionCount={results.length > 0 ? results.length : null}
diff --git a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md
index 465a0ab6a6..eb3d7a5368 100644
--- a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md
+++ b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md
@@ -129,10 +129,15 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md).
 
 ## Supported models
 
-| Model                                                                                       | Number of keypoints | Keypoint list                                               | Multi-size Support |
-| ------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ |
-| [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose) | 17                  | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640)  |
+| Model                                                                                                         | Number of keypoints | Keypoint list                                               | Multi-size Support |
+| ------------------------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ |
+| [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose)                   | 17                  | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640)  |
+| [RF-DETR Keypoint (preview)](https://huggingface.co/software-mansion/react-native-executorch-rfdetr-keypoint) | 17                  | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | No                 |
 
 :::tip
 YOLO models support multiple input sizes (384px, 512px, 640px). Smaller sizes are faster but less accurate, while larger sizes are more accurate but slower. Choose based on your speed/accuracy requirements.
 :::
+
+:::warning
+`rfdetr_keypoint_preview` is an export of **preview weights** and may be re-exported under a different constant once a stable version ships. It is a single-input-size model (no `inputSize` option) and ships with `xnnpack`, `coreml`, and `mlx` backends — pass `{ backend }` to override the platform default, e.g. `models.pose_estimation.rfdetr_keypoint_preview({ backend: 'mlx' })`.
+:::
diff --git a/docs/docs/05-utilities/model-registry.md b/docs/docs/05-utilities/model-registry.md
index 3611731235..1f135ea43c 100644
--- a/docs/docs/05-utilities/model-registry.md
+++ b/docs/docs/05-utilities/model-registry.md
@@ -26,7 +26,7 @@ Each leaf is a **function**. Call it (optionally with `{ quant, backend }`) to g
 | `classification`        | `efficientnet_v2_s`                                                                                                                                                                                                                                                                         |
 | `privacy_filter`        | `openai`, `nemotron`                                                                                                                                                                                                                                                                        |
 | `object_detection`      | `ssdlite_320_mobilenet_v3_large`, `yolo26n` … `yolo26x`, `rf_detr_nano`                                                                                                                                                                                                                     |
-| `pose_estimation`       | `yolo26n`                                                                                                                                                                                                                                                                                   |
+| `pose_estimation`       | `yolo26n`, `rfdetr_keypoint_preview` _(beta)_                                                                                                                                                                                                                                               |
 | `semantic_segmentation` | `deeplab_v3_resnet50`, `lraspp_mobilenet_v3_large`, `fcn_resnet101`, `selfie_segmentation`, …                                                                                                                                                                                               |
 | `instance_segmentation` | `yolo26n` … `yolo26x`, `rf_detr_nano`, `fastsam_s`, `fastsam_x`                                                                                                                                                                                                                             |
 | `style_transfer`        | `candy`, `mosaic`, `rain_princess`, `udnie`                                                                                                                                                                                                                                                 |
diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts
index c3cda78498..a7e880c1a7 100644
--- a/packages/react-native-executorch/src/constants/modelRegistry.ts
+++ b/packages/react-native-executorch/src/constants/modelRegistry.ts
@@ -276,6 +276,31 @@ const RF_DETR_NANO_SEG_VARIANTS = {
   },
 };
 
+// RF-DETR Keypoint (pose estimation) — BETA preview. All three backends
+// (xnnpack, coreml, mlx) ship fp32 (non-quantized) weights; this entry may
+// be re-exported under a different constant once more RF-DETR keypoint
+// weights are released.
+const RF_DETR_KEYPOINT_PREVIEW_VARIANTS = {
+  xnnpack: {
+    base: {
+      modelName: 'rfdetr-keypoint-preview' as const,
+      modelSource: M.RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL,
+    },
+  },
+  coreml: {
+    base: {
+      modelName: 'rfdetr-keypoint-preview' as const,
+      modelSource: M.RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL,
+    },
+  },
+  mlx: {
+    base: {
+      modelName: 'rfdetr-keypoint-preview' as const,
+      modelSource: M.RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL,
+    },
+  },
+};
+
 const FASTSAM_S_VARIANTS = {
   xnnpack: {
     base: {
@@ -553,6 +578,9 @@ export const models = {
   },
   pose_estimation: {
     yolo26n: base(M.YOLO26N_POSE),
+    // BETA preview — may be re-exported under a different constant once a
+    // stable RF-DETR keypoint model ships.
+    rfdetr_keypoint_preview: variant(RF_DETR_KEYPOINT_PREVIEW_VARIANTS),
   },
   semantic_segmentation: {
     deeplab_v3_resnet50: pair(
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 3f26537acc..8827722b9e 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -731,6 +731,28 @@ export const YOLO26N_POSE = {
   modelSource: YOLO26N_POSE_MODEL,
 } as const;
 
+// RF-DETR Keypoint (pose estimation) — BETA preview.
+// NOTE: served from the `preview/` path under PREVIOUS_VERSION_TAG (shipping as
+// part of a patch release). This export is a preview and may be re-exported
+// under a different constant once a stable version ships.
+export const RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/xnnpack/rfdetr_keypoint_preview_xnnpack_fp32.pte`;
+export const RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/coreml/rfdetr_keypoint_preview_coreml_fp32.pte`;
+export const RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/mlx/rfdetr_keypoint_preview_mlx_fp32.pte`;
+const RF_DETR_KEYPOINT_PREVIEW_MODEL =
+  Platform.OS === 'ios'
+    ? RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL
+    : RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL;
+
+/**
+ * @category Models - Pose Estimation
+ * @beta Preview export — may be re-exported under a different constant once a
+ * stable RF-DETR keypoint model ships.
+ */
+export const RF_DETR_KEYPOINT_PREVIEW = {
+  modelName: 'rfdetr-keypoint-preview',
+  modelSource: RF_DETR_KEYPOINT_PREVIEW_MODEL,
+} as const;
+
 // Style transfer
 /**
  * Builds the four `(backend, precision)` URLs for a single style-transfer style.
diff --git a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts
index ff2b68b1fd..34ddf45952 100644
--- a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts
+++ b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts
@@ -29,8 +29,20 @@ const YOLO_POSE_CONFIG = {
   defaultKeypointThreshold: 0.5,
 } satisfies PoseEstimationConfig<typeof CocoKeypoint>;
 
+// RF-DETR keypoint preview (BETA). Unlike yolo26n-pose's multi-method
+// `forward_<size>` export, this ships a single `forward` method — omitting
+// availableInputSizes/defaultInputSize makes forward() dispatch to plain
+// `forward`. May be renamed once a stable model ships.
+const RFDETR_KEYPOINT_CONFIG = {
+  keypointMap: CocoKeypoint,
+  preprocessorConfig: undefined,
+  defaultDetectionThreshold: 0.5,
+  defaultKeypointThreshold: 0.5,
+} satisfies PoseEstimationConfig<typeof CocoKeypoint>;
+
 const ModelConfigs = {
   'yolo26n-pose': YOLO_POSE_CONFIG,
+  'rfdetr-keypoint-preview': RFDETR_KEYPOINT_CONFIG,
 } as const satisfies Record<
   PoseEstimationModelName,
   PoseEstimationConfig<LabelEnum>
diff --git a/packages/react-native-executorch/src/types/poseEstimation.ts b/packages/react-native-executorch/src/types/poseEstimation.ts
index 03afc592c3..c7ae352925 100644
--- a/packages/react-native-executorch/src/types/poseEstimation.ts
+++ b/packages/react-native-executorch/src/types/poseEstimation.ts
@@ -62,10 +62,16 @@ export type PoseEstimationConfig<K extends LabelEnum> = {
  * Each model name maps to its required fields.
  * @category Types
  */
-export type PoseEstimationModelSources = {
-  modelName: 'yolo26n-pose';
-  modelSource: ResourceSource;
-};
+export type PoseEstimationModelSources =
+  | {
+      modelName: 'yolo26n-pose';
+      modelSource: ResourceSource;
+    }
+  // RF-DETR keypoint preview (BETA) — may be renamed once a stable model ships.
+  | {
+      modelName: 'rfdetr-keypoint-preview';
+      modelSource: ResourceSource;
+    };
 
 /**
  * Union of all built-in pose estimation model names.

From 04b21610980111c8aa2dd4aef5cb90d768c2c7b8 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 17 Jun 2026 16:26:57 +0200
Subject: [PATCH 2/5] chore: bump version in package.json

---
 packages/react-native-executorch/package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/react-native-executorch/package.json b/packages/react-native-executorch/package.json
index cfbf1361a8..ab7a8e8e9d 100644
--- a/packages/react-native-executorch/package.json
+++ b/packages/react-native-executorch/package.json
@@ -1,6 +1,6 @@
 {
   "name": "react-native-executorch",
-  "version": "0.9.1",
+  "version": "0.9.2",
   "description": "An easy way to run AI models in React Native with ExecuTorch",
   "source": "./src/index.ts",
   "main": "./lib/module/index.js",

From 6959fe045c6fa2ef92833aad94b794e12a5ddd90 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 17 Jun 2026 16:43:00 +0200
Subject: [PATCH 3/5] chore: replace model url to point to 0.9

---
 packages/react-native-executorch/src/constants/modelUrls.ts | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 8827722b9e..64aca10c6a 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -735,9 +735,9 @@ export const YOLO26N_POSE = {
 // NOTE: served from the `preview/` path under PREVIOUS_VERSION_TAG (shipping as
 // part of a patch release). This export is a preview and may be re-exported
 // under a different constant once a stable version ships.
-export const RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/xnnpack/rfdetr_keypoint_preview_xnnpack_fp32.pte`;
-export const RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/coreml/rfdetr_keypoint_preview_coreml_fp32.pte`;
-export const RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${PREVIOUS_VERSION_TAG}/preview/mlx/rfdetr_keypoint_preview_mlx_fp32.pte`;
+export const RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/xnnpack/rfdetr_keypoint_preview_xnnpack_fp32.pte`;
+export const RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/coreml/rfdetr_keypoint_preview_coreml_fp32.pte`;
+export const RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/mlx/rfdetr_keypoint_preview_mlx_fp32.pte`;
 const RF_DETR_KEYPOINT_PREVIEW_MODEL =
   Platform.OS === 'ios'
     ? RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL

From aea5db0805f6755e94953ef2b3025166548b84f8 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 11 Jun 2026 21:49:06 +0200
Subject: [PATCH 4/5] fix(llm): snapshot vision encoder output before caching
 (#1229)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Description

In any multimodal conversation with more than one image, the model
starts describing earlier images as the most recently sent one on later
turns.

`VisionEncoder::encode` caches the `EValue` returned by
`vision_encoder.execute()` per image path. That tensor aliases the
method's reusable output buffer, so the next `execute()` (the second
image, or any later encode) overwrites the bytes behind every cached
entry. On re-prefilled turns the prefiller then splices the latest
image's embeddings into every image slot. The audio path already
snapshots its encoder output for exactly this reason (see the
`AudioSlot` comment in `multimodal_prefiller.cpp`); vision never got the
same treatment.

The fix copies the encoder output into bytes owned by the cache entry
immediately after `execute()` and serves cache hits from a tensor
wrapping those owned bytes (`unordered_map` nodes are pointer-stable, so
the blob stays valid).

The bug is backend-independent (the cache sits above the delegate), so
XNNPACK/Vulkan multimodal models are affected the same way.

### Introduces a breaking change?

- [ ] Yes
- [x] No

### Type of change

- [x] Bug fix (change which fixes an issue)
- [ ] New feature (change which adds functionality)
- [ ] Documentation update (improves or adds clarity to existing
documentation)
- [ ] Other (chores, tests, code style improvements etc.)

### Tested on

- [x] iOS
- [ ] Android

### Testing instructions

1. Run the example LLM app with a multimodal model (e.g. Gemma 4 E2B
multimodal) on the Multimodal LLM screen.
2. Send image A with "What's in this picture?" — answer is correct.
3. Send image B (different content) with the same question — answer is
correct.
4. Ask "What was in the FIRST picture I sent?".

Before this fix, step 4 describes image B's content (both image slots
receive B's embeddings on the re-prefilled turn). After the fix, the
model correctly recalls image A.

### Screenshots

N/A

### Related issues

N/A

### Checklist

- [x] I have performed a self-review of my code
- [x] I have commented my code, particularly in hard-to-understand areas
- [ ] I have updated the documentation accordingly
- [x] My changes generate no new warnings

### Additional notes

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../common/runner/encoders/vision_encoder.cpp   | 17 +++++++++++++----
 .../common/runner/encoders/vision_encoder.h     | 17 +++++++++++++++--
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
index 59fee53e11..09fb459661 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
@@ -112,7 +112,7 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
 
   auto it = embedding_cache_.find(path);
   if (it != embedding_cache_.end()) {
-    return it->second;
+    return EValue(*it->second.tensor);
   }
 
   auto shape = ET_UNWRAP(getInputShape());
@@ -128,9 +128,18 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
       chw.data(), sizes, ::executorch::aten::ScalarType::Float);
 
   auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
-  auto embedding = result[0];
-  embedding_cache_.emplace(path, embedding);
-  return embedding;
+  auto out_tensor = result[0].toTensor();
+
+  CachedEmbedding cached;
+  cached.bytes.resize(out_tensor.nbytes());
+  std::memcpy(cached.bytes.data(), out_tensor.const_data_ptr(),
+              out_tensor.nbytes());
+  cached.sizes.assign(out_tensor.sizes().begin(), out_tensor.sizes().end());
+  cached.dtype = out_tensor.scalar_type();
+  auto [entry, inserted] = embedding_cache_.emplace(path, std::move(cached));
+  entry->second.tensor = ::executorch::extension::from_blob(
+      entry->second.bytes.data(), entry->second.sizes, entry->second.dtype);
+  return EValue(*entry->second.tensor);
 }
 
 } // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
index bb8a8421b9..54d43bb869 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
@@ -2,11 +2,14 @@
 #pragma once
 
 #include "iencoder.h"
+#include <cstdint>
 #include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
 #include <executorch/runtime/core/evalue.h>
 #include <runner/multimodal_input.h>
 #include <string>
 #include <unordered_map>
+#include <vector>
 
 namespace executorch::extension::llm {
 
@@ -26,13 +29,23 @@ class VisionEncoder : public IEncoder {
     bool with_batch;
   };
 
+  // The method's output EValue aliases the runtime's reusable output buffer,
+  // which the NEXT vision_encoder.execute() overwrites — caching it directly
+  // silently turns earlier images into the most recently encoded one. Cache
+  // an owned byte snapshot instead and hand out a tensor over those bytes.
+  struct CachedEmbedding {
+    std::vector<uint8_t> bytes;
+    std::vector<::executorch::aten::SizesType> sizes;
+    ::executorch::aten::ScalarType dtype;
+    ::executorch::extension::TensorPtr tensor;
+  };
+
   ::executorch::runtime::Result<ImageShape> getInputShape() const;
   std::vector<float> preprocessImage(const std::string &path,
                                      const ImageShape &targetShape) const;
 
   ::executorch::extension::Module *module_;
-  std::unordered_map<std::string, ::executorch::runtime::EValue>
-      embedding_cache_;
+  std::unordered_map<std::string, CachedEmbedding> embedding_cache_;
 };
 
 } // namespace executorch::extension::llm

From 64b68deb229e1c03e0213252bc7c013deadde2d4 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 15 Jun 2026 20:13:32 +0200
Subject: [PATCH 5/5] perf: speed up top-p sampling for large vocabularies
 (#1232)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Description

Optimizes token sampling for large-vocabulary models (e.g. Gemma 4 E2B,
262k vocab), where the previous full-vocabulary sort in top-p dominated
per-token latency.

Two changes in `sampler.cpp`:

- **`mask_topp`**: replaces the `O(n log n)` sort over all logits with a
logit-space histogram (`kBins=2048`) that locates the nucleus threshold
in two `O(n)` passes — no sort, no per-token vocab-sized allocation.
Binning in logit space (rather than probability space) keeps uniform
resolution for both peaked and flat distributions.
- **`softmax`**: skips `exp()` on logits already masked to `lowest()` by
top-k/top-p. The result underflows to zero anyway, and the call is slow
on device.

On an iPhone 17 Pro with Gemma 4 E2B (int4), per-token sampling drops
from ~45 ms to ~10 ms. The histogram approximates the exact sort-based
nucleus; the resulting sampled distribution is statistically equivalent
(verified that the kept-mass fraction stays within 1% of the exact nucleus
across peaked, flat, and sharp distributions).

### Introduces a breaking change?

- [ ] Yes
- [x] No

### Type of change

- [ ] Bug fix (change which fixes an issue)
- [ ] New feature (change which adds functionality)
- [ ] Documentation update (improves or adds clarity to existing
documentation)
- [x] Other (chores, tests, code style improvements etc.)

### Tested on

- [x] iOS
- [ ] Android

### Testing instructions

1. Run an LLM with a large vocabulary and a non-zero temperature with
`topP` set (e.g. Gemma 4 E2B with `temperature: 0.8`, `topP: 0.9`).
2. Generate a long response and observe tokens/sec.
3. Confirm output remains coherent and sampling is unchanged in
character (still stochastic, not greedy).

Greedy decoding (`temperature: 0`) is unaffected — it bypasses this path
entirely.

### Screenshots

<!-- N/A -->

### Related issues

<!-- N/A -->

### Checklist

- [x] I have performed a self-review of my code
- [x] I have commented my code, particularly in hard-to-understand areas
- [ ] I have updated the documentation accordingly
- [x] My changes generate no new warnings

### Additional notes

The histogram is an approximation bounded by bin granularity
(`kBins=2048` over a `kRange=40` logit span). This is intentional: exact
top-p over a 262k vocab where the nucleus can exceed 100k tokens is
inherently expensive, and the sampling outcome is statistically
indistinguishable from the exact version.

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../common/runner/sampler.cpp                 | 93 +++++++++++--------
 1 file changed, 52 insertions(+), 41 deletions(-)

diff --git a/packages/react-native-executorch/common/runner/sampler.cpp b/packages/react-native-executorch/common/runner/sampler.cpp
index 250d6a83ef..8215c49be5 100644
--- a/packages/react-native-executorch/common/runner/sampler.cpp
+++ b/packages/react-native-executorch/common/runner/sampler.cpp
@@ -36,6 +36,9 @@
 #include <algorithm>
 #include <ctime>
 #include <limits>
+#include <ranges>
+#include <span>
+#include <type_traits>
 #include <vector>
 
 namespace executorch {
@@ -141,57 +144,59 @@ template <typename T> void Sampler::mask_topk(T *logits) {
   }
 }
 
-// Mask logits whose softmax-prob falls outside the top-p nucleus to -inf.
-// Keeps the token that crosses the threshold (HuggingFace convention).
+// Mask logits outside the top-p nucleus to -inf. Approximates the exact
+// sort-based nucleus with a histogram over (logit - max): two O(n) passes, no
+// sort. Binning in logit (not probability) space keeps uniform resolution for
+// peaked and flat distributions alike. kRange=40 spans exp() down to ~4e-18.
 template <typename T> void Sampler::mask_topp(T *logits) {
   if (topp_ <= 0.0f || topp_ >= 1.0f) {
     return;
   }
-  // Softmax into a scratch probs[] (do not mutate logits yet).
-  T max_val = logits[0];
-  for (size_t i = 1; i < vocab_size_; i++) {
-    if (logits[i] > max_val) {
-      max_val = logits[i];
-    }
-  }
-  std::unique_ptr<ProbIndex<T>[]> probindex =
-      std::make_unique<ProbIndex<T>[]>(vocab_size_);
-  T sum = 0;
+  constexpr int32_t kBins = 2048;
+  // Compute in a type at least as wide as T so converting logits never loses
+  // precision: double stays double, everything else (float and the narrow
+  // half/bf16/uint16 logit types) widens to float. Accumulating in T directly
+  // would be unsafe for bf16, whose mantissa saturates when summing exp()
+  // over the full vocab.
+  using acc_t = std::conditional_t<std::is_same_v<T, double>, double, float>;
+  constexpr acc_t kRange = 40;
+
+  std::span<const T> logit_span{logits, static_cast<size_t>(vocab_size_)};
+  const acc_t max_val =
+      static_cast<acc_t>(*std::ranges::max_element(logit_span));
+
+  std::vector<acc_t> bin_mass(kBins, acc_t(0));
+  acc_t total = 0;
   for (size_t i = 0; i < vocab_size_; i++) {
-    T e = static_cast<T>(std::expf(static_cast<float>(logits[i] - max_val)));
-    probindex[i].prob = e;
-    probindex[i].index = i;
-    sum += e;
+    acc_t d = static_cast<acc_t>(logits[i]) - max_val;
+    acc_t e = std::exp(d);
+    total += e;
+    int32_t bin = static_cast<int32_t>((d + kRange) / kRange * kBins);
+    bin = std::clamp(bin, 0, kBins - 1);
+    bin_mass[bin] += e;
   }
-  if (sum <= T(0)) {
+  if (total <= acc_t(0)) {
     return;
   }
-  for (size_t i = 0; i < vocab_size_; i++) {
-    probindex[i].prob /= sum;
-  }
-  std::sort(probindex.get(), probindex.get() + vocab_size_,
-            [](const ProbIndex<T> &a, const ProbIndex<T> &b) {
-              return a.prob > b.prob;
-            });
 
-  // Find the smallest prefix whose cumulative probability >= topp_.
-  T cumulative = 0;
-  int last_idx = vocab_size_ - 1;
-  for (size_t i = 0; i < vocab_size_; i++) {
-    cumulative += probindex[i].prob;
-    if (static_cast<float>(cumulative) >= topp_) {
-      last_idx = i;
+  // Highest bin downward until the kept mass reaches topp. The crossing bin is
+  // kept (HuggingFace "keep the token that crosses" convention).
+  const acc_t target = static_cast<acc_t>(topp_) * total;
+  acc_t acc = 0;
+  int32_t keep_bin = 0;
+  for (int32_t bin = kBins - 1; bin >= 0; --bin) {
+    acc += bin_mass[bin];
+    if (acc >= target) {
+      keep_bin = bin;
       break;
     }
   }
-  // Mark kept indices, then -inf the rest.
-  std::vector<bool> keep(vocab_size_, false);
-  for (size_t i = 0; i <= last_idx; i++) {
-    keep[probindex[i].index] = true;
-  }
+  const acc_t d_threshold =
+      static_cast<acc_t>(keep_bin) / kBins * kRange - kRange;
+
   constexpr T neg_inf = std::numeric_limits<T>::lowest();
   for (size_t i = 0; i < vocab_size_; i++) {
-    if (!keep[i]) {
+    if (static_cast<acc_t>(logits[i]) - max_val < d_threshold) {
       logits[i] = neg_inf;
     }
   }
@@ -210,22 +215,28 @@ Sampler::Sampler(int32_t vocab_size, GenerationConfig config)
     : Sampler(vocab_size, config, std::time(nullptr)) {}
 
 template <typename T> static void softmax(T *x, int size) {
-  // find max value (for numerical stability)
+  // Runs after top-k/top-p masking, which sets rejected logits to lowest().
+  // Skip exp() on those: it underflows to 0 anyway and is slow on device.
+  constexpr T kMasked = std::numeric_limits<T>::lowest();
   T max_val = x[0];
   for (size_t i = 1; i < size; i++) {
     if (x[i] > max_val) {
       max_val = x[i];
     }
   }
-  // exp and sum
   T sum = 0;
   for (size_t i = 0; i < size; i++) {
+    if (x[i] == kMasked) {
+      x[i] = T(0);
+      continue;
+    }
     x[i] = expf(x[i] - max_val);
     sum += x[i];
   }
-  // normalize
   for (size_t i = 0; i < size; i++) {
-    x[i] /= sum;
+    if (x[i] != T(0)) {
+      x[i] /= sum;
+    }
   }
 }