Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions apps/computer-vision/app/pose_estimation/index.tsx
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import Spinner from '../../components/Spinner';
import { BottomBar } from '../../components/BottomBar';
import { ModelPicker, ModelOption } from '../../components/ModelPicker';
import { getImage } from '../../utils';
import {
models,
usePoseEstimation,
PoseDetections,
PoseEstimationModelSources,
RnExecutorchError,
RnExecutorchErrorCode,
} from 'react-native-executorch';
Expand All @@ -17,6 +19,16 @@ import Svg, { Circle, Line } from 'react-native-svg';
import ErrorBanner from '../../components/ErrorBanner';
import { COCO_SKELETON_CONNECTIONS } from '../../components/utils/cocoSkeleton';

/** Pose-estimation branch of the model registry, aliased to keep call sites short. */
const { pose_estimation: poseEstimation } = models;

/** Entries passed to the model picker on this screen, in display order. */
const MODELS: ModelOption<PoseEstimationModelSources>[] = [
  {
    label: 'YOLO26N Pose',
    value: poseEstimation.yolo26n(),
  },
  {
    label: 'RF-DETR Keypoint (beta)',
    value: poseEstimation.rfdetr_keypoint_preview(),
  },
];

// Colors used to tell different people apart.
const PERSON_COLORS = [
  'lime',
  'cyan',
  'magenta',
  'yellow',
  'orange',
  'pink',
];

Expand All @@ -30,8 +42,10 @@ export default function PoseEstimationScreen() {
}>();
const [inferenceTime, setInferenceTime] = useState<number | null>(null);
const [layout, setLayout] = useState({ width: 0, height: 0 });
const [selectedModel, setSelectedModel] =
useState<PoseEstimationModelSources>(poseEstimation.yolo26n());

const model = usePoseEstimation({ model: models.pose_estimation.yolo26n() });
const model = usePoseEstimation({ model: selectedModel });
const { setGlobalGenerating } = useContext(GeneratingContext);

useEffect(() => {
Expand Down Expand Up @@ -60,7 +74,7 @@ export default function PoseEstimationScreen() {
if (imageUri) {
try {
const start = Date.now();
const output = await model.forward(imageUri, { inputSize: 384 });
const output = await model.forward(imageUri);
setInferenceTime(Date.now() - start);
setResults(output);
} catch (e) {
Expand Down Expand Up @@ -206,6 +220,16 @@ export default function PoseEstimationScreen() {
</View>
)}
</View>
<ModelPicker
models={MODELS}
selectedModel={selectedModel}
disabled={model.isGenerating}
onSelect={(m) => {
setSelectedModel(m);
setResults([]);
setInferenceTime(null);
}}
/>
<StatsBar
inferenceTime={inferenceTime}
detectionCount={results.length > 0 ? results.length : null}
Expand Down
11 changes: 8 additions & 3 deletions docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,15 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md).

## Supported models

| Model | Number of keypoints | Keypoint list | Multi-size Support |
| ------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ |
| [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640) |
| Model | Number of keypoints | Keypoint list | Multi-size Support |
| ------------------------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ |
| [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640) |
| [RF-DETR Keypoint (preview)](https://huggingface.co/software-mansion/react-native-executorch-rfdetr-keypoint) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | No |

:::tip
YOLO models support multiple input sizes (384px, 512px, 640px). Smaller sizes are faster but less accurate, while larger sizes are more accurate but slower. Choose based on your speed/accuracy requirements.
:::

:::warning
`rfdetr_keypoint_preview` is an export of **preview weights** and may be re-exported under a different constant once a stable version ships. It is a single-input-size model (it takes no `inputSize` option) and ships `xnnpack`, `coreml`, and `mlx` backends — pass `{ backend }` to override the platform default, e.g. `models.pose_estimation.rfdetr_keypoint_preview({ backend: 'mlx' })`.
:::
2 changes: 1 addition & 1 deletion docs/docs/05-utilities/model-registry.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Each leaf is a **function**. Call it (optionally with `{ quant, backend }`) to g
| `classification` | `efficientnet_v2_s` |
| `privacy_filter` | `openai`, `nemotron` |
| `object_detection` | `ssdlite_320_mobilenet_v3_large`, `yolo26n` … `yolo26x`, `rf_detr_nano` |
| `pose_estimation` | `yolo26n` |
| `pose_estimation` | `yolo26n`, `rfdetr_keypoint_preview` _(beta)_ |
| `semantic_segmentation` | `deeplab_v3_resnet50`, `lraspp_mobilenet_v3_large`, `fcn_resnet101`, `selfie_segmentation`, … |
| `instance_segmentation` | `yolo26n` … `yolo26x`, `rf_detr_nano`, `fastsam_s`, `fastsam_x` |
| `style_transfer` | `candy`, `mosaic`, `rain_princess`, `udnie` |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {

auto it = embedding_cache_.find(path);
if (it != embedding_cache_.end()) {
return it->second;
return EValue(*it->second.tensor);
}

auto shape = ET_UNWRAP(getInputShape());
Expand All @@ -128,9 +128,18 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
chw.data(), sizes, ::executorch::aten::ScalarType::Float);

auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
auto embedding = result[0];
embedding_cache_.emplace(path, embedding);
return embedding;
auto out_tensor = result[0].toTensor();

CachedEmbedding cached;
cached.bytes.resize(out_tensor.nbytes());
std::memcpy(cached.bytes.data(), out_tensor.const_data_ptr(),
out_tensor.nbytes());
cached.sizes.assign(out_tensor.sizes().begin(), out_tensor.sizes().end());
cached.dtype = out_tensor.scalar_type();
auto [entry, inserted] = embedding_cache_.emplace(path, std::move(cached));
entry->second.tensor = ::executorch::extension::from_blob(
entry->second.bytes.data(), entry->second.sizes, entry->second.dtype);
return EValue(*entry->second.tensor);
}

} // namespace executorch::extension::llm
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
#pragma once

#include "iencoder.h"
#include <cstdint>
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/evalue.h>
#include <runner/multimodal_input.h>
#include <string>
#include <unordered_map>
#include <vector>

namespace executorch::extension::llm {

Expand All @@ -26,13 +29,23 @@ class VisionEncoder : public IEncoder {
bool with_batch;
};

// The method's output EValue aliases the runtime's reusable output buffer,
// which the NEXT vision_encoder.execute() overwrites — caching it directly
// silently turns earlier images into the most recently encoded one. Cache
// an owned byte snapshot instead and hand out a tensor over those bytes.
struct CachedEmbedding {
// Owned copy of the encoder output's raw bytes.
std::vector<uint8_t> bytes;
// Dimensions copied from the encoder output tensor.
std::vector<::executorch::aten::SizesType> sizes;
// Element type copied from the encoder output tensor.
::executorch::aten::ScalarType dtype;
// Tensor built over `bytes`/`sizes`/`dtype` with from_blob(); encode()
// creates it only after the entry has been stored in the cache map.
::executorch::extension::TensorPtr tensor;
};

::executorch::runtime::Result<ImageShape> getInputShape() const;
std::vector<float> preprocessImage(const std::string &path,
const ImageShape &targetShape) const;

::executorch::extension::Module *module_;
std::unordered_map<std::string, ::executorch::runtime::EValue>
embedding_cache_;
std::unordered_map<std::string, CachedEmbedding> embedding_cache_;
};

} // namespace executorch::extension::llm
93 changes: 52 additions & 41 deletions packages/react-native-executorch/common/runner/sampler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
#include <algorithm>
#include <ctime>
#include <limits>
#include <ranges>
#include <span>
#include <type_traits>
#include <vector>

namespace executorch {
Expand Down Expand Up @@ -141,57 +144,59 @@ template <typename T> void Sampler::mask_topk(T *logits) {
}
}

// Mask logits whose softmax-prob falls outside the top-p nucleus to -inf.
// Keeps the token that crosses the threshold (HuggingFace convention).
// Mask logits outside the top-p nucleus to -inf. Approximates the exact
// sort-based nucleus with a histogram over (logit - max): two O(n) passes, no
// sort. Binning in logit (not probability) space keeps uniform resolution for
// peaked and flat distributions alike. kRange=40 spans exp() down to ~4e-18.
template <typename T> void Sampler::mask_topp(T *logits) {
if (topp_ <= 0.0f || topp_ >= 1.0f) {
return;
}
// Softmax into a scratch probs[] (do not mutate logits yet).
T max_val = logits[0];
for (size_t i = 1; i < vocab_size_; i++) {
if (logits[i] > max_val) {
max_val = logits[i];
}
}
std::unique_ptr<ProbIndex<T>[]> probindex =
std::make_unique<ProbIndex<T>[]>(vocab_size_);
T sum = 0;
constexpr int32_t kBins = 2048;
// Compute in a type at least as wide as T so converting logits never loses
// precision: double stays double, everything else (float and the narrow
// half/bf16/uint16 logit types) widens to float. Accumulating in T directly
// would be unsafe for bf16, whose mantissa saturates when summing exp()
// over the full vocab.
using acc_t = std::conditional_t<std::is_same_v<T, double>, double, float>;
constexpr acc_t kRange = 40;

std::span<const T> logit_span{logits, static_cast<size_t>(vocab_size_)};
const acc_t max_val =
static_cast<acc_t>(*std::ranges::max_element(logit_span));

std::vector<acc_t> bin_mass(kBins, acc_t(0));
acc_t total = 0;
for (size_t i = 0; i < vocab_size_; i++) {
T e = static_cast<T>(std::expf(static_cast<float>(logits[i] - max_val)));
probindex[i].prob = e;
probindex[i].index = i;
sum += e;
acc_t d = static_cast<acc_t>(logits[i]) - max_val;
acc_t e = std::exp(d);
total += e;
int32_t bin = static_cast<int32_t>((d + kRange) / kRange * kBins);
bin = std::clamp(bin, 0, kBins - 1);
bin_mass[bin] += e;
}
if (sum <= T(0)) {
if (total <= acc_t(0)) {
return;
}
for (size_t i = 0; i < vocab_size_; i++) {
probindex[i].prob /= sum;
}
std::sort(probindex.get(), probindex.get() + vocab_size_,
[](const ProbIndex<T> &a, const ProbIndex<T> &b) {
return a.prob > b.prob;
});

// Find the smallest prefix whose cumulative probability >= topp_.
T cumulative = 0;
int last_idx = vocab_size_ - 1;
for (size_t i = 0; i < vocab_size_; i++) {
cumulative += probindex[i].prob;
if (static_cast<float>(cumulative) >= topp_) {
last_idx = i;
// Highest bin downward until the kept mass reaches topp. The crossing bin is
// kept (HuggingFace "keep the token that crosses" convention).
const acc_t target = static_cast<acc_t>(topp_) * total;
acc_t acc = 0;
int32_t keep_bin = 0;
for (int32_t bin = kBins - 1; bin >= 0; --bin) {
acc += bin_mass[bin];
if (acc >= target) {
keep_bin = bin;
break;
}
}
// Mark kept indices, then -inf the rest.
std::vector<bool> keep(vocab_size_, false);
for (size_t i = 0; i <= last_idx; i++) {
keep[probindex[i].index] = true;
}
const acc_t d_threshold =
static_cast<acc_t>(keep_bin) / kBins * kRange - kRange;

constexpr T neg_inf = std::numeric_limits<T>::lowest();
for (size_t i = 0; i < vocab_size_; i++) {
if (!keep[i]) {
if (static_cast<acc_t>(logits[i]) - max_val < d_threshold) {
logits[i] = neg_inf;
}
}
Expand All @@ -210,22 +215,28 @@ Sampler::Sampler(int32_t vocab_size, GenerationConfig config)
: Sampler(vocab_size, config, std::time(nullptr)) {}

template <typename T> static void softmax(T *x, int size) {
// find max value (for numerical stability)
// Runs after top-k/top-p masking, which sets rejected logits to lowest().
// Skip exp() on those: it underflows to 0 anyway and is slow on device.
constexpr T kMasked = std::numeric_limits<T>::lowest();
T max_val = x[0];
for (size_t i = 1; i < size; i++) {
if (x[i] > max_val) {
max_val = x[i];
}
}
// exp and sum
T sum = 0;
for (size_t i = 0; i < size; i++) {
if (x[i] == kMasked) {
x[i] = T(0);
continue;
}
x[i] = expf(x[i] - max_val);
sum += x[i];
}
// normalize
for (size_t i = 0; i < size; i++) {
x[i] /= sum;
if (x[i] != T(0)) {
x[i] /= sum;
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion packages/react-native-executorch/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "react-native-executorch",
"version": "0.9.1",
"version": "0.9.2",
"description": "An easy way to run AI models in React Native with ExecuTorch",
"source": "./src/index.ts",
"main": "./lib/module/index.js",
Expand Down
28 changes: 28 additions & 0 deletions packages/react-native-executorch/src/constants/modelRegistry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,31 @@ const RF_DETR_NANO_SEG_VARIANTS = {
},
};

// RF-DETR Keypoint (pose estimation) — BETA preview. Every backend below
// ships fp32 (non-quantized) weights only, so each one exposes a single
// `base` entry. This table may be re-exported under a different constant
// once more RF-DETR keypoint weights are released.

// Builds the `base`-only entry shared by all backends for a given .pte source.
const rfDetrKeypointPreviewEntry = (modelSource: string) => ({
  base: {
    modelName: 'rfdetr-keypoint-preview' as const,
    modelSource,
  },
});

const RF_DETR_KEYPOINT_PREVIEW_VARIANTS = {
  xnnpack: rfDetrKeypointPreviewEntry(
    M.RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL
  ),
  coreml: rfDetrKeypointPreviewEntry(
    M.RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL
  ),
  mlx: rfDetrKeypointPreviewEntry(M.RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL),
};

const FASTSAM_S_VARIANTS = {
xnnpack: {
base: {
Expand Down Expand Up @@ -553,6 +578,9 @@ export const models = {
},
pose_estimation: {
yolo26n: base(M.YOLO26N_POSE),
// BETA preview — may be re-exported under a different constant once a
// stable RF-DETR keypoint model ships.
rfdetr_keypoint_preview: variant(RF_DETR_KEYPOINT_PREVIEW_VARIANTS),
},
semantic_segmentation: {
deeplab_v3_resnet50: pair(
Expand Down
22 changes: 22 additions & 0 deletions packages/react-native-executorch/src/constants/modelUrls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,28 @@ export const YOLO26N_POSE = {
modelSource: YOLO26N_POSE_MODEL,
} as const;

// RF-DETR Keypoint (pose estimation) — BETA preview.
// NOTE: served from the `preview/` path under PREVIOUS_VERSION_TAG (shipping as
// part of a patch release). This export is a preview and may be re-exported
// under a different constant once a stable version ships.
// NOTE(review): the URLs below interpolate VERSION_TAG, not the
// PREVIOUS_VERSION_TAG named above — confirm which tag actually hosts the
// files and align either the comment or the URLs.
export const RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/xnnpack/rfdetr_keypoint_preview_xnnpack_fp32.pte`;
export const RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/coreml/rfdetr_keypoint_preview_coreml_fp32.pte`;
export const RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/mlx/rfdetr_keypoint_preview_mlx_fp32.pte`;
// Platform-dependent default source: the Core ML export on iOS, the XNNPACK
// export on every other platform. The MLX export is never chosen here.
const RF_DETR_KEYPOINT_PREVIEW_MODEL =
Platform.OS === 'ios'
? RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL
: RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL;

/**
 * RF-DETR Keypoint preview model config whose `modelSource` is the
 * platform-dependent default (Core ML on iOS, XNNPACK elsewhere).
 * @category Models - Pose Estimation
 * @beta Preview export — may be re-exported under a different constant once a
 * stable RF-DETR keypoint model ships.
 */
export const RF_DETR_KEYPOINT_PREVIEW = {
modelName: 'rfdetr-keypoint-preview',
modelSource: RF_DETR_KEYPOINT_PREVIEW_MODEL,
} as const;

// Style transfer
/**
* Builds the four `(backend, precision)` URLs for a single style-transfer style.
Expand Down
Loading
Loading