diff --git a/apps/computer-vision/app/pose_estimation/index.tsx b/apps/computer-vision/app/pose_estimation/index.tsx index 4546b628fa..8e16693f6a 100644 --- a/apps/computer-vision/app/pose_estimation/index.tsx +++ b/apps/computer-vision/app/pose_estimation/index.tsx @@ -1,10 +1,12 @@ import Spinner from '../../components/Spinner'; import { BottomBar } from '../../components/BottomBar'; +import { ModelPicker, ModelOption } from '../../components/ModelPicker'; import { getImage } from '../../utils'; import { models, usePoseEstimation, PoseDetections, + PoseEstimationModelSources, RnExecutorchError, RnExecutorchErrorCode, } from 'react-native-executorch'; @@ -17,6 +19,16 @@ import Svg, { Circle, Line } from 'react-native-svg'; import ErrorBanner from '../../components/ErrorBanner'; import { COCO_SKELETON_CONNECTIONS } from '../../components/utils/cocoSkeleton'; +const poseEstimation = models.pose_estimation; + +const MODELS: ModelOption[] = [ + { label: 'YOLO26N Pose', value: poseEstimation.yolo26n() }, + { + label: 'RF-DETR Keypoint (beta)', + value: poseEstimation.rfdetr_keypoint_preview(), + }, +]; + // Colors for different people const PERSON_COLORS = ['lime', 'cyan', 'magenta', 'yellow', 'orange', 'pink']; @@ -30,8 +42,10 @@ export default function PoseEstimationScreen() { }>(); const [inferenceTime, setInferenceTime] = useState(null); const [layout, setLayout] = useState({ width: 0, height: 0 }); + const [selectedModel, setSelectedModel] = + useState(poseEstimation.yolo26n()); - const model = usePoseEstimation({ model: models.pose_estimation.yolo26n() }); + const model = usePoseEstimation({ model: selectedModel }); const { setGlobalGenerating } = useContext(GeneratingContext); useEffect(() => { @@ -60,7 +74,7 @@ export default function PoseEstimationScreen() { if (imageUri) { try { const start = Date.now(); - const output = await model.forward(imageUri, { inputSize: 384 }); + const output = await model.forward(imageUri); setInferenceTime(Date.now() - start); 
setResults(output); } catch (e) { @@ -206,6 +220,16 @@ export default function PoseEstimationScreen() { )} + { + setSelectedModel(m); + setResults([]); + setInferenceTime(null); + }} + /> 0 ? results.length : null} diff --git a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md index 465a0ab6a6..eb3d7a5368 100644 --- a/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md +++ b/docs/docs/03-hooks/02-computer-vision/usePoseEstimation.md @@ -129,10 +129,15 @@ See the full guide: [VisionCamera Integration](./visioncamera-integration.md). ## Supported models -| Model | Number of keypoints | Keypoint list | Multi-size Support | -| ------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ | -| [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640) | +| Model | Number of keypoints | Keypoint list | Multi-size Support | +| ------------------------------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------- | ------------------ | +| [YOLO26N-Pose](https://huggingface.co/software-mansion/react-native-executorch-yolo26-pose) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | Yes (384/512/640) | +| [RF-DETR Keypoint (preview)](https://huggingface.co/software-mansion/react-native-executorch-rfdetr-keypoint) | 17 | [COCO](../../06-api-reference/enumerations/CocoKeypoint.md) | No | :::tip YOLO models support multiple input sizes (384px, 512px, 640px). Smaller sizes are faster but less accurate, while larger sizes are more accurate but slower. Choose based on your speed/accuracy requirements. 
::: + +:::warning +`rfdetr_keypoint_preview` is a **preview weights** export and may be re-exported under a different constant once a stable version ships. It is a single-input-size model (no `inputSize` option) and ships `xnnpack`, `coreml`, and `mlx` backends — pass `{ backend }` to override the platform default, e.g. `models.pose_estimation.rfdetr_keypoint_preview({ backend: 'mlx' })`. +::: diff --git a/docs/docs/05-utilities/model-registry.md b/docs/docs/05-utilities/model-registry.md index 3611731235..1f135ea43c 100644 --- a/docs/docs/05-utilities/model-registry.md +++ b/docs/docs/05-utilities/model-registry.md @@ -26,7 +26,7 @@ Each leaf is a **function**. Call it (optionally with `{ quant, backend }`) to g | `classification` | `efficientnet_v2_s` | | `privacy_filter` | `openai`, `nemotron` | | `object_detection` | `ssdlite_320_mobilenet_v3_large`, `yolo26n` … `yolo26x`, `rf_detr_nano` | -| `pose_estimation` | `yolo26n` | +| `pose_estimation` | `yolo26n`, `rfdetr_keypoint_preview` _(beta)_ | | `semantic_segmentation` | `deeplab_v3_resnet50`, `lraspp_mobilenet_v3_large`, `fcn_resnet101`, `selfie_segmentation`, … | | `instance_segmentation` | `yolo26n` … `yolo26x`, `rf_detr_nano`, `fastsam_s`, `fastsam_x` | | `style_transfer` | `candy`, `mosaic`, `rain_princess`, `udnie` | diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp index 59fee53e11..09fb459661 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp @@ -112,7 +112,7 @@ Result VisionEncoder::encode(const MultimodalInput &input) { auto it = embedding_cache_.find(path); if (it != embedding_cache_.end()) { - return it->second; + return EValue(*it->second.tensor); } auto shape = ET_UNWRAP(getInputShape()); @@ -128,9 +128,18 @@ Result VisionEncoder::encode(const MultimodalInput &input) { 
chw.data(), sizes, ::executorch::aten::ScalarType::Float); auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor)); - auto embedding = result[0]; - embedding_cache_.emplace(path, embedding); - return embedding; + auto out_tensor = result[0].toTensor(); + + CachedEmbedding cached; + cached.bytes.resize(out_tensor.nbytes()); + std::memcpy(cached.bytes.data(), out_tensor.const_data_ptr(), + out_tensor.nbytes()); + cached.sizes.assign(out_tensor.sizes().begin(), out_tensor.sizes().end()); + cached.dtype = out_tensor.scalar_type(); + auto [entry, inserted] = embedding_cache_.emplace(path, std::move(cached)); + entry->second.tensor = ::executorch::extension::from_blob( + entry->second.bytes.data(), entry->second.sizes, entry->second.dtype); + return EValue(*entry->second.tensor); } } // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h index bb8a8421b9..54d43bb869 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h @@ -2,11 +2,14 @@ #pragma once #include "iencoder.h" +#include #include +#include #include #include #include #include +#include namespace executorch::extension::llm { @@ -26,13 +29,23 @@ class VisionEncoder : public IEncoder { bool with_batch; }; + // The method's output EValue aliases the runtime's reusable output buffer, + // which the NEXT vision_encoder.execute() overwrites — caching it directly + // silently turns earlier images into the most recently encoded one. Cache + // an owned byte snapshot instead and hand out a tensor over those bytes. 
+ struct CachedEmbedding { + std::vector bytes; + std::vector<::executorch::aten::SizesType> sizes; + ::executorch::aten::ScalarType dtype; + ::executorch::extension::TensorPtr tensor; + }; + ::executorch::runtime::Result getInputShape() const; std::vector preprocessImage(const std::string &path, const ImageShape &targetShape) const; ::executorch::extension::Module *module_; - std::unordered_map - embedding_cache_; + std::unordered_map embedding_cache_; }; } // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/common/runner/sampler.cpp b/packages/react-native-executorch/common/runner/sampler.cpp index 250d6a83ef..8215c49be5 100644 --- a/packages/react-native-executorch/common/runner/sampler.cpp +++ b/packages/react-native-executorch/common/runner/sampler.cpp @@ -36,6 +36,9 @@ #include #include #include +#include +#include +#include #include namespace executorch { @@ -141,57 +144,59 @@ template void Sampler::mask_topk(T *logits) { } } -// Mask logits whose softmax-prob falls outside the top-p nucleus to -inf. -// Keeps the token that crosses the threshold (HuggingFace convention). +// Mask logits outside the top-p nucleus to -inf. Approximates the exact +// sort-based nucleus with a histogram over (logit - max): two O(n) passes, no +// sort. Binning in logit (not probability) space keeps uniform resolution for +// peaked and flat distributions alike. kRange=40 spans exp() down to ~4e-18. template void Sampler::mask_topp(T *logits) { if (topp_ <= 0.0f || topp_ >= 1.0f) { return; } - // Softmax into a scratch probs[] (do not mutate logits yet). 
- T max_val = logits[0]; - for (size_t i = 1; i < vocab_size_; i++) { - if (logits[i] > max_val) { - max_val = logits[i]; - } - } - std::unique_ptr[]> probindex = - std::make_unique[]>(vocab_size_); - T sum = 0; + constexpr int32_t kBins = 2048; + // Compute in a type at least as wide as T so converting logits never loses + // precision: double stays double, everything else (float and the narrow + // half/bf16/uint16 logit types) widens to float. Accumulating in T directly + // would be unsafe for bf16, whose mantissa saturates when summing exp() + // over the full vocab. + using acc_t = std::conditional_t, double, float>; + constexpr acc_t kRange = 40; + + std::span logit_span{logits, static_cast(vocab_size_)}; + const acc_t max_val = + static_cast(*std::ranges::max_element(logit_span)); + + std::vector bin_mass(kBins, acc_t(0)); + acc_t total = 0; for (size_t i = 0; i < vocab_size_; i++) { - T e = static_cast(std::expf(static_cast(logits[i] - max_val))); - probindex[i].prob = e; - probindex[i].index = i; - sum += e; + acc_t d = static_cast(logits[i]) - max_val; + acc_t e = std::exp(d); + total += e; + int32_t bin = static_cast((d + kRange) / kRange * kBins); + bin = std::clamp(bin, 0, kBins - 1); + bin_mass[bin] += e; } - if (sum <= T(0)) { + if (total <= acc_t(0)) { return; } - for (size_t i = 0; i < vocab_size_; i++) { - probindex[i].prob /= sum; - } - std::sort(probindex.get(), probindex.get() + vocab_size_, - [](const ProbIndex &a, const ProbIndex &b) { - return a.prob > b.prob; - }); - // Find the smallest prefix whose cumulative probability >= topp_. - T cumulative = 0; - int last_idx = vocab_size_ - 1; - for (size_t i = 0; i < vocab_size_; i++) { - cumulative += probindex[i].prob; - if (static_cast(cumulative) >= topp_) { - last_idx = i; + // Highest bin downward until the kept mass reaches topp. The crossing bin is + // kept (HuggingFace "keep the token that crosses" convention). 
+ const acc_t target = static_cast(topp_) * total; + acc_t acc = 0; + int32_t keep_bin = 0; + for (int32_t bin = kBins - 1; bin >= 0; --bin) { + acc += bin_mass[bin]; + if (acc >= target) { + keep_bin = bin; break; } } - // Mark kept indices, then -inf the rest. - std::vector keep(vocab_size_, false); - for (size_t i = 0; i <= last_idx; i++) { - keep[probindex[i].index] = true; - } + const acc_t d_threshold = + static_cast(keep_bin) / kBins * kRange - kRange; + constexpr T neg_inf = std::numeric_limits::lowest(); for (size_t i = 0; i < vocab_size_; i++) { - if (!keep[i]) { + if (static_cast(logits[i]) - max_val < d_threshold) { logits[i] = neg_inf; } } @@ -210,22 +215,28 @@ Sampler::Sampler(int32_t vocab_size, GenerationConfig config) : Sampler(vocab_size, config, std::time(nullptr)) {} template static void softmax(T *x, int size) { - // find max value (for numerical stability) + // Runs after top-k/top-p masking, which sets rejected logits to lowest(). + // Skip exp() on those: it underflows to 0 anyway and is slow on device. 
+ constexpr T kMasked = std::numeric_limits::lowest(); T max_val = x[0]; for (size_t i = 1; i < size; i++) { if (x[i] > max_val) { max_val = x[i]; } } - // exp and sum T sum = 0; for (size_t i = 0; i < size; i++) { + if (x[i] == kMasked) { + x[i] = T(0); + continue; + } x[i] = expf(x[i] - max_val); sum += x[i]; } - // normalize for (size_t i = 0; i < size; i++) { - x[i] /= sum; + if (x[i] != T(0)) { + x[i] /= sum; + } } } diff --git a/packages/react-native-executorch/package.json b/packages/react-native-executorch/package.json index cfbf1361a8..ab7a8e8e9d 100644 --- a/packages/react-native-executorch/package.json +++ b/packages/react-native-executorch/package.json @@ -1,6 +1,6 @@ { "name": "react-native-executorch", - "version": "0.9.1", + "version": "0.9.2", "description": "An easy way to run AI models in React Native with ExecuTorch", "source": "./src/index.ts", "main": "./lib/module/index.js", diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts index c3cda78498..a7e880c1a7 100644 --- a/packages/react-native-executorch/src/constants/modelRegistry.ts +++ b/packages/react-native-executorch/src/constants/modelRegistry.ts @@ -276,6 +276,31 @@ const RF_DETR_NANO_SEG_VARIANTS = { }, }; +// RF-DETR Keypoint (pose estimation) — BETA preview. +// All three backends ship fp32 +// (non-quantized); this entry may be re-exported under a different constant +// once more RF-DETR keypoint weights are released.
+const RF_DETR_KEYPOINT_PREVIEW_VARIANTS = { + xnnpack: { + base: { + modelName: 'rfdetr-keypoint-preview' as const, + modelSource: M.RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL, + }, + }, + coreml: { + base: { + modelName: 'rfdetr-keypoint-preview' as const, + modelSource: M.RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL, + }, + }, + mlx: { + base: { + modelName: 'rfdetr-keypoint-preview' as const, + modelSource: M.RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL, + }, + }, +}; + const FASTSAM_S_VARIANTS = { xnnpack: { base: { @@ -553,6 +578,9 @@ export const models = { }, pose_estimation: { yolo26n: base(M.YOLO26N_POSE), + // BETA preview — may be re-exported under a different constant once a + // stable RF-DETR keypoint model ships. + rfdetr_keypoint_preview: variant(RF_DETR_KEYPOINT_PREVIEW_VARIANTS), }, semantic_segmentation: { deeplab_v3_resnet50: pair( diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 3f26537acc..64aca10c6a 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -731,6 +731,28 @@ export const YOLO26N_POSE = { modelSource: YOLO26N_POSE_MODEL, } as const; +// RF-DETR Keypoint (pose estimation) — BETA preview. +// NOTE: served from the `preview/` path under VERSION_TAG (shipping as part of a +// patch release) — TODO(review): this said PREVIOUS_VERSION_TAG; confirm the tag. +// This export is a preview and may be re-exported under a different constant once a stable version ships.
+export const RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/xnnpack/rfdetr_keypoint_preview_xnnpack_fp32.pte`; +export const RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/coreml/rfdetr_keypoint_preview_coreml_fp32.pte`; +export const RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/mlx/rfdetr_keypoint_preview_mlx_fp32.pte`; +const RF_DETR_KEYPOINT_PREVIEW_MODEL = + Platform.OS === 'ios' + ? RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL + : RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL; + +/** + * @category Models - Pose Estimation + * @beta Preview export — may be re-exported under a different constant once a + * stable RF-DETR keypoint model ships. + */ +export const RF_DETR_KEYPOINT_PREVIEW = { + modelName: 'rfdetr-keypoint-preview', + modelSource: RF_DETR_KEYPOINT_PREVIEW_MODEL, +} as const; + // Style transfer /** * Builds the four `(backend, precision)` URLs for a single style-transfer style. diff --git a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts index ff2b68b1fd..34ddf45952 100644 --- a/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts +++ b/packages/react-native-executorch/src/modules/computer_vision/PoseEstimationModule.ts @@ -29,8 +29,20 @@ const YOLO_POSE_CONFIG = { defaultKeypointThreshold: 0.5, } satisfies PoseEstimationConfig; +// RF-DETR keypoint preview (BETA). Unlike yolo26n-pose's multi-method +// `forward_` export, this ships a single `forward` method — omitting +// availableInputSizes/defaultInputSize makes forward() dispatch to plain +// `forward`. May be renamed once a stable model ships. 
+const RFDETR_KEYPOINT_CONFIG = { + keypointMap: CocoKeypoint, + preprocessorConfig: undefined, + defaultDetectionThreshold: 0.5, + defaultKeypointThreshold: 0.5, +} satisfies PoseEstimationConfig; + const ModelConfigs = { 'yolo26n-pose': YOLO_POSE_CONFIG, + 'rfdetr-keypoint-preview': RFDETR_KEYPOINT_CONFIG, } as const satisfies Record< PoseEstimationModelName, PoseEstimationConfig diff --git a/packages/react-native-executorch/src/types/poseEstimation.ts b/packages/react-native-executorch/src/types/poseEstimation.ts index 03afc592c3..c7ae352925 100644 --- a/packages/react-native-executorch/src/types/poseEstimation.ts +++ b/packages/react-native-executorch/src/types/poseEstimation.ts @@ -62,10 +62,16 @@ export type PoseEstimationConfig = { * Each model name maps to its required fields. * @category Types */ -export type PoseEstimationModelSources = { - modelName: 'yolo26n-pose'; - modelSource: ResourceSource; -}; +export type PoseEstimationModelSources = + | { + modelName: 'yolo26n-pose'; + modelSource: ResourceSource; + } + // RF-DETR keypoint preview (BETA) — may be renamed once a stable model ships. + | { + modelName: 'rfdetr-keypoint-preview'; + modelSource: ResourceSource; + }; /** * Union of all built-in pose estimation model names.