software-mansion · chmjkb · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/apps/computer-vision/app/pose_estimation/index.tsx b/apps/computer-vision/app/pose_estimation/index.tsx
@@ -1,10 +1,12 @@
 import Spinner from '../../components/Spinner';
 import { BottomBar } from '../../components/BottomBar';
+import { ModelPicker, ModelOption } from '../../components/ModelPicker';
 import { getImage } from '../../utils';
 import {
   models,
   usePoseEstimation,
   PoseDetections,
+  PoseEstimationModelSources,
   RnExecutorchError,
   RnExecutorchErrorCode,
 } from 'react-native-executorch';
@@ -17,6 +19,16 @@ import Svg, { Circle, Line } from 'react-native-svg';
 import ErrorBanner from '../../components/ErrorBanner';
 import { COCO_SKELETON_CONNECTIONS } from '../../components/utils/cocoSkeleton';
 
+const poseEstimation = models.pose_estimation;
+
+const MODELS: ModelOption<PoseEstimationModelSources>[] = [
+  { label: 'YOLO26N Pose', value: poseEstimation.yolo26n() },
+  {
+    label: 'RF-DETR Keypoint (beta)',
+    value: poseEstimation.rfdetr_keypoint_preview(),
+  },
+];
+
 // Colors for different people
 const PERSON_COLORS = ['lime', 'cyan', 'magenta', 'yellow', 'orange', 'pink'];
 
@@ -30,8 +42,10 @@ export default function PoseEstimationScreen() {
   }>();
   const [inferenceTime, setInferenceTime] = useState<number | null>(null);
   const [layout, setLayout] = useState({ width: 0, height: 0 });
+  const [selectedModel, setSelectedModel] =
+    useState<PoseEstimationModelSources>(poseEstimation.yolo26n());
 
-  const model = usePoseEstimation({ model: models.pose_estimation.yolo26n() });
+  const model = usePoseEstimation({ model: selectedModel });
   const { setGlobalGenerating } = useContext(GeneratingContext);
 
   useEffect(() => {
@@ -60,7 +74,7 @@ export default function PoseEstimationScreen() {
     if (imageUri) {
       try {
         const start = Date.now();
-        const output = await model.forward(imageUri, { inputSize: 384 });
+        const output = await model.forward(imageUri);
         setInferenceTime(Date.now() - start);
         setResults(output);
       } catch (e) {
@@ -206,6 +220,16 @@ export default function PoseEstimationScreen() {
           </View>
         )}
       </View>
+      <ModelPicker
+        models={MODELS}
+        selectedModel={selectedModel}
+        disabled={model.isGenerating}
+        onSelect={(m) => {
+          setSelectedModel(m);
+          setResults([]);
+          setInferenceTime(null);
+        }}
+      />
       <StatsBar
         inferenceTime={inferenceTime}
         detectionCount={results.length > 0 ? results.length : null}

diff --git a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md
@@ -129,10 +129,15 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md).
 
 ## Supported models
 
-| Model                                                                                       | Number of keypoints | Keypoint list                                               | Multi-size Support |
-| ------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ |
-| [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose) | 17                  | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640)  |
+| Model                                                                                                         | Number of keypoints | Keypoint list                                               | Multi-size Support |
+| ------------------------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ |
+| [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose)                   | 17                  | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640)  |
+| [RF-DETR Keypoint (preview)](https://huggingface.co/software-mansion/react-native-executorch-rfdetr-keypoint) | 17                  | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | No                 |
 
 :::tip
 YOLO models support multiple input sizes (384px, 512px, 640px). Smaller sizes are faster but less accurate, while larger sizes are more accurate but slower. Choose based on your speed/accuracy requirements.
 :::
+
+:::warning
+`rfdetr_keypoint_preview` is an export of **preview weights** and may be re-exported under a different constant once a stable version ships. It is a single-input-size model (it does not accept the `inputSize` option) and ships with `xnnpack`, `coreml`, and `mlx` backends — pass `{ backend }` to override the platform default, e.g. `models.pose_estimation.rfdetr_keypoint_preview({ backend: 'mlx' })`.
+:::
diff --git a/docs/docs/05-utilities/model-registry.md b/docs/docs/05-utilities/model-registry.md
@@ -26,7 +26,7 @@ Each leaf is a **function**. Call it (optionally with `{ quant, backend }`) to g
 | `classification`        | `efficientnet_v2_s`                                                                                                                                                                                                                                                                         |
 | `privacy_filter`        | `openai`, `nemotron`                                                                                                                                                                                                                                                                        |
 | `object_detection`      | `ssdlite_320_mobilenet_v3_large`, `yolo26n` … `yolo26x`, `rf_detr_nano`                                                                                                                                                                                                                     |
-| `pose_estimation`       | `yolo26n`                                                                                                                                                                                                                                                                                   |
+| `pose_estimation`       | `yolo26n`, `rfdetr_keypoint_preview` _(beta)_                                                                                                                                                                                                                                               |
 | `semantic_segmentation` | `deeplab_v3_resnet50`, `lraspp_mobilenet_v3_large`, `fcn_resnet101`, `selfie_segmentation`, …                                                                                                                                                                                               |
 | `instance_segmentation` | `yolo26n` … `yolo26x`, `rf_detr_nano`, `fastsam_s`, `fastsam_x`                                                                                                                                                                                                                             |
 | `style_transfer`        | `candy`, `mosaic`, `rain_princess`, `udnie`                                                                                                                                                                                                                                                 |

diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
@@ -112,7 +112,7 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
 
   auto it = embedding_cache_.find(path);
   if (it != embedding_cache_.end()) {
-    return it->second;
+    return EValue(*it->second.tensor);
   }
 
   auto shape = ET_UNWRAP(getInputShape());
@@ -128,9 +128,18 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
       chw.data(), sizes, ::executorch::aten::ScalarType::Float);
 
   auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
-  auto embedding = result[0];
-  embedding_cache_.emplace(path, embedding);
-  return embedding;
+  auto out_tensor = result[0].toTensor();
+
+  CachedEmbedding cached;
+  cached.bytes.resize(out_tensor.nbytes());
+  std::memcpy(cached.bytes.data(), out_tensor.const_data_ptr(),
+              out_tensor.nbytes());
+  cached.sizes.assign(out_tensor.sizes().begin(), out_tensor.sizes().end());
+  cached.dtype = out_tensor.scalar_type();
+  auto [entry, inserted] = embedding_cache_.emplace(path, std::move(cached));
+  entry->second.tensor = ::executorch::extension::from_blob(
+      entry->second.bytes.data(), entry->second.sizes, entry->second.dtype);
+  return EValue(*entry->second.tensor);
 }
 
 } // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
@@ -2,11 +2,14 @@
 #pragma once
 
 #include "iencoder.h"
+#include <cstdint>
 #include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
 #include <executorch/runtime/core/evalue.h>
 #include <runner/multimodal_input.h>
 #include <string>
 #include <unordered_map>
+#include <vector>
 
 namespace executorch::extension::llm {
 
@@ -26,13 +29,23 @@ class VisionEncoder : public IEncoder {
     bool with_batch;
   };
 
+  // The method's output EValue aliases the runtime's reusable output buffer,
+  // which the NEXT vision_encoder.execute() overwrites — caching it directly
+  // silently turns earlier images into the most recently encoded one. Cache
+  // an owned byte snapshot instead and hand out a tensor over those bytes.
+  struct CachedEmbedding {
+    std::vector<uint8_t> bytes;
+    std::vector<::executorch::aten::SizesType> sizes;
+    ::executorch::aten::ScalarType dtype;
+    ::executorch::extension::TensorPtr tensor;
+  };
+
   ::executorch::runtime::Result<ImageShape> getInputShape() const;
   std::vector<float> preprocessImage(const std::string &path,
                                      const ImageShape &targetShape) const;
 
   ::executorch::extension::Module *module_;
-  std::unordered_map<std::string, ::executorch::runtime::EValue>
-      embedding_cache_;
+  std::unordered_map<std::string, CachedEmbedding> embedding_cache_;
 };
 
 } // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/sampler.cpp b/packages/react-native-executorch/common/runner/sampler.cpp
@@ -36,6 +36,9 @@
 #include <algorithm>
 #include <ctime>
 #include <limits>
+#include <ranges>
+#include <span>
+#include <type_traits>
 #include <vector>
 
 namespace executorch {
@@ -141,57 +144,59 @@ template <typename T> void Sampler::mask_topk(T *logits) {
   }
 }
 
-// Mask logits whose softmax-prob falls outside the top-p nucleus to -inf.
-// Keeps the token that crosses the threshold (HuggingFace convention).
+// Mask logits outside the top-p nucleus to -inf. Approximates the exact
+// sort-based nucleus with a histogram over (logit - max): two O(n) passes, no
+// sort. Binning in logit (not probability) space keeps uniform resolution for
+// peaked and flat distributions alike. kRange=40 spans exp() down to ~4e-18.
 template <typename T> void Sampler::mask_topp(T *logits) {
   if (topp_ <= 0.0f || topp_ >= 1.0f) {
     return;
   }
-  // Softmax into a scratch probs[] (do not mutate logits yet).
-  T max_val = logits[0];
-  for (size_t i = 1; i < vocab_size_; i++) {
-    if (logits[i] > max_val) {
-      max_val = logits[i];
-    }
-  }
-  std::unique_ptr<ProbIndex<T>[]> probindex =
-      std::make_unique<ProbIndex<T>[]>(vocab_size_);
-  T sum = 0;
+  constexpr int32_t kBins = 2048;
+  // Compute in a type at least as wide as T so converting logits never loses
+  // precision: double stays double, everything else (float and the narrow
+  // half/bf16/uint16 logit types) widens to float. Accumulating in T directly
+  // would be unsafe for bf16, whose mantissa saturates when summing exp()
+  // over the full vocab.
+  using acc_t = std::conditional_t<std::is_same_v<T, double>, double, float>;
+  constexpr acc_t kRange = 40;
+
+  std::span<const T> logit_span{logits, static_cast<size_t>(vocab_size_)};
+  const acc_t max_val =
+      static_cast<acc_t>(*std::ranges::max_element(logit_span));
+
+  std::vector<acc_t> bin_mass(kBins, acc_t(0));
+  acc_t total = 0;
   for (size_t i = 0; i < vocab_size_; i++) {
-    T e = static_cast<T>(std::expf(static_cast<float>(logits[i] - max_val)));
-    probindex[i].prob = e;
-    probindex[i].index = i;
-    sum += e;
+    acc_t d = static_cast<acc_t>(logits[i]) - max_val;
+    acc_t e = std::exp(d);
+    total += e;
+    const acc_t pos = std::max(acc_t(0), (d + kRange) / kRange * kBins);
+    const int32_t bin = static_cast<int32_t>(std::min(pos, acc_t(kBins - 1)));
+    bin_mass[bin] += e; // pos is clamped pre-cast: out-of-range cast is UB
   }
-  if (sum <= T(0)) {
+  if (total <= acc_t(0)) {
     return;
   }
-  for (size_t i = 0; i < vocab_size_; i++) {
-    probindex[i].prob /= sum;
-  }
-  std::sort(probindex.get(), probindex.get() + vocab_size_,
-            [](const ProbIndex<T> &a, const ProbIndex<T> &b) {
-              return a.prob > b.prob;
-            });
 
-  // Find the smallest prefix whose cumulative probability >= topp_.
-  T cumulative = 0;
-  int last_idx = vocab_size_ - 1;
-  for (size_t i = 0; i < vocab_size_; i++) {
-    cumulative += probindex[i].prob;
-    if (static_cast<float>(cumulative) >= topp_) {
-      last_idx = i;
+  // Highest bin downward until the kept mass reaches topp. The crossing bin is
+  // kept (HuggingFace "keep the token that crosses" convention).
+  const acc_t target = static_cast<acc_t>(topp_) * total;
+  acc_t acc = 0;
+  int32_t keep_bin = 0;
+  for (int32_t bin = kBins - 1; bin >= 0; --bin) {
+    acc += bin_mass[bin];
+    if (acc >= target) {
+      keep_bin = bin;
       break;
     }
   }
-  // Mark kept indices, then -inf the rest.
-  std::vector<bool> keep(vocab_size_, false);
-  for (size_t i = 0; i <= last_idx; i++) {
-    keep[probindex[i].index] = true;
-  }
+  const acc_t d_threshold =
+      static_cast<acc_t>(keep_bin) / kBins * kRange - kRange;
+
   constexpr T neg_inf = std::numeric_limits<T>::lowest();
   for (size_t i = 0; i < vocab_size_; i++) {
-    if (!keep[i]) {
+    if (static_cast<acc_t>(logits[i]) - max_val < d_threshold) {
       logits[i] = neg_inf;
     }
   }
@@ -210,22 +215,28 @@ Sampler::Sampler(int32_t vocab_size, GenerationConfig config)
     : Sampler(vocab_size, config, std::time(nullptr)) {}
 
 template <typename T> static void softmax(T *x, int size) {
-  // find max value (for numerical stability)
+  // Runs after top-k/top-p masking, which sets rejected logits to lowest().
+  // Skip exp() on those: it underflows to 0 anyway and is slow on device.
+  constexpr T kMasked = std::numeric_limits<T>::lowest();
   T max_val = x[0];
   for (size_t i = 1; i < size; i++) {
     if (x[i] > max_val) {
       max_val = x[i];
     }
   }
-  // exp and sum
   T sum = 0;
   for (size_t i = 0; i < size; i++) {
+    if (x[i] == kMasked) {
+      x[i] = T(0);
+      continue;
+    }
     x[i] = expf(x[i] - max_val);
     sum += x[i];
   }
-  // normalize
   for (size_t i = 0; i < size; i++) {
-    x[i] /= sum;
+    if (x[i] != T(0)) {
+      x[i] /= sum;
+    }
   }
 }
 

diff --git a/packages/react-native-executorch/package.json b/packages/react-native-executorch/package.json
@@ -1,6 +1,6 @@
 {
   "name": "react-native-executorch",
-  "version": "0.9.1",
+  "version": "0.9.2",
   "description": "An easy way to run AI models in React Native with ExecuTorch",
   "source": "./src/index.ts",
   "main": "./lib/module/index.js",

diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts
@@ -276,6 +276,31 @@ const RF_DETR_NANO_SEG_VARIANTS = {
   },
 };
 
+// RF-DETR Keypoint (pose estimation) — BETA preview. One variant per backend
+// (xnnpack, coreml, mlx); all three ship fp32 (non-quantized) weights. This
+// entry may be re-exported under a different constant once more RF-DETR
+// keypoint weights are released.
+const RF_DETR_KEYPOINT_PREVIEW_VARIANTS = {
+  xnnpack: {
+    base: {
+      modelName: 'rfdetr-keypoint-preview' as const,
+      modelSource: M.RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL,
+    },
+  },
+  coreml: {
+    base: {
+      modelName: 'rfdetr-keypoint-preview' as const,
+      modelSource: M.RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL,
+    },
+  },
+  mlx: {
+    base: {
+      modelName: 'rfdetr-keypoint-preview' as const,
+      modelSource: M.RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL,
+    },
+  },
+};
+
 const FASTSAM_S_VARIANTS = {
   xnnpack: {
     base: {
@@ -553,6 +578,9 @@ export const models = {
   },
   pose_estimation: {
     yolo26n: base(M.YOLO26N_POSE),
+    // BETA preview — may be re-exported under a different constant once a
+    // stable RF-DETR keypoint model ships.
+    rfdetr_keypoint_preview: variant(RF_DETR_KEYPOINT_PREVIEW_VARIANTS),
   },
   semantic_segmentation: {
     deeplab_v3_resnet50: pair(

diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -731,6 +731,28 @@ export const YOLO26N_POSE = {
   modelSource: YOLO26N_POSE_MODEL,
 } as const;
 
+// RF-DETR Keypoint (pose estimation) — BETA preview.
+// NOTE: served from the `preview/` path under VERSION_TAG (shipping as
+// part of a patch release). This export is a preview and may be re-exported
+// under a different constant once a stable version ships.
+export const RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/xnnpack/rfdetr_keypoint_preview_xnnpack_fp32.pte`;
+export const RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/coreml/rfdetr_keypoint_preview_coreml_fp32.pte`;
+export const RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/mlx/rfdetr_keypoint_preview_mlx_fp32.pte`;
+const RF_DETR_KEYPOINT_PREVIEW_MODEL =
+  Platform.OS === 'ios'
+    ? RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL
+    : RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL;
+
+/**
+ * @category Models - Pose Estimation
+ * @beta Preview export — may be re-exported under a different constant once a
+ * stable RF-DETR keypoint model ships.
+ */
+export const RF_DETR_KEYPOINT_PREVIEW = {
+  modelName: 'rfdetr-keypoint-preview',
+  modelSource: RF_DETR_KEYPOINT_PREVIEW_MODEL,
+} as const;
+
 // Style transfer
 /**
  * Builds the four `(backend, precision)` URLs for a single style-transfer style.