software-mansion · NorbertKlockiewicz · Jun 18, 2026 · Jun 18, 2026
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
@@ -4,6 +4,7 @@
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/data_processing/ImageProcessing.h>
 #include <runner/constants.h>
+#include <runner/util.h>
 
 #include <executorch/extension/tensor/tensor.h>
 #include <opencv2/opencv.hpp>
@@ -70,6 +71,7 @@ Result<VisionEncoder::ImageShape> VisionEncoder::getInputShape() const {
       .height = static_cast<int32_t>(dims[offset + 1]),
       .width = static_cast<int32_t>(dims[offset + 2]),
       .with_batch = with_batch,
+      .dtype = input_meta.scalar_type(),
   };
 }
 
@@ -124,8 +126,12 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
     sizes.insert(sizes.begin(), 1);
   }
 
+  // Preprocessing produces fp32 pixels; convert to the method's declared
+  // input dtype (`shape.dtype`, already read in getInputShape). Float is a
+  // passthrough, so the common path stays copy-free.
   auto image_tensor = ::executorch::extension::from_blob(
       chw.data(), sizes, ::executorch::aten::ScalarType::Float);
+  image_tensor = ET_UNWRAP(convert_from_float(image_tensor, shape.dtype));
 
   auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
   auto out_tensor = result[0].toTensor();

diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
@@ -27,6 +27,7 @@ class VisionEncoder : public IEncoder {
   struct ImageShape {
     int32_t channels, height, width;
     bool with_batch;
+    ::executorch::aten::ScalarType dtype;
   };
 
   // The method's output EValue aliases the runtime's reusable output buffer,

diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
@@ -24,6 +24,20 @@ using ::executorch::runtime::Error;
 using ::executorch::runtime::EValue;
 using ::executorch::runtime::Result;
 
+namespace {
+// Element-wise convert `count` values from `src` (Src) into the raw byte
+// buffer `dst` (interpreted as Dst). Used to splice an image-embed tensor of
+// one dtype into the fused-embeds buffer of another.
+template <typename Src, typename Dst>
+void castCopy(const void *src, uint8_t *dst, size_t count) {
+  const auto *s = static_cast<const Src *>(src);
+  auto *d = reinterpret_cast<Dst *>(dst);
+  for (size_t i = 0; i < count; ++i) {
+    d[i] = static_cast<Dst>(s[i]);
+  }
+}
+} // namespace
+
 MultimodalPrefiller::MultimodalPrefiller(
     Module &module, MultimodalDecoderRunner &decoder_runner,
     tokenizers::HFTokenizer &tokenizer,
@@ -186,24 +200,23 @@ bool MultimodalPrefiller::get_enable_dynamic_shape() const {
   uint8_t *dst = embeds_buf.data() + static_cast<size_t>(slot.slot_start) *
                                          static_cast<size_t>(hidden) *
                                          embeds_elem_size;
+  using ::executorch::aten::ScalarType;
+  const void *src = vision_tensor.const_data_ptr();
   if (vision_dtype == embeds_dtype) {
-    const uint8_t *src =
-        static_cast<const uint8_t *>(vision_tensor.const_data_ptr());
     std::memcpy(dst, src, visual_elems * embeds_elem_size);
-  } else if (vision_dtype == ::executorch::aten::ScalarType::Float &&
-             embeds_dtype == ::executorch::aten::ScalarType::Half) {
-    const float *src = vision_tensor.const_data_ptr<float>();
-    auto *dst_h = reinterpret_cast<::executorch::aten::Half *>(dst);
-    for (size_t i = 0; i < visual_elems; ++i) {
-      dst_h[i] = ::executorch::aten::Half(src[i]);
-    }
-  } else if (vision_dtype == ::executorch::aten::ScalarType::Half &&
-             embeds_dtype == ::executorch::aten::ScalarType::Float) {
-    const auto *src = vision_tensor.const_data_ptr<::executorch::aten::Half>();
-    auto *dst_f = reinterpret_cast<float *>(dst);
-    for (size_t i = 0; i < visual_elems; ++i) {
-      dst_f[i] = static_cast<float>(src[i]);
-    }
+  } else if (vision_dtype == ScalarType::Float &&
+             embeds_dtype == ScalarType::Half) {
+    castCopy<float, ::executorch::aten::Half>(src, dst, visual_elems);
+  } else if (vision_dtype == ScalarType::Half &&
+             embeds_dtype == ScalarType::Float) {
+    castCopy<::executorch::aten::Half, float>(src, dst, visual_elems);
+  } else if (vision_dtype == ScalarType::Float &&
+             embeds_dtype == ScalarType::BFloat16) {
+    // Hybrid VLM: fp32 vision encoder (e.g. XNNPACK) + bf16 decoder embeds.
+    castCopy<float, ::executorch::aten::BFloat16>(src, dst, visual_elems);
+  } else if (vision_dtype == ScalarType::BFloat16 &&
+             embeds_dtype == ScalarType::Float) {
+    castCopy<::executorch::aten::BFloat16, float>(src, dst, visual_elems);
   } else {
     ET_CHECK_OR_RETURN_ERROR(
         false, InvalidState,

diff --git a/packages/react-native-executorch/common/runner/util.h b/packages/react-native-executorch/common/runner/util.h
@@ -170,6 +170,54 @@ convert_to_bfloat16(const ::executorch::extension::TensorPtr &src_tensor) {
   return bf16_tensor;
 }
 
+/**
+ * Helper function to convert a float tensor to float16 (Half).
+ * Creates a new tensor with Half dtype and copies/converts the data.
+ */
+inline ::executorch::runtime::Result<::executorch::extension::TensorPtr>
+convert_to_float16(const ::executorch::extension::TensorPtr &src_tensor) {
+  ET_CHECK_OR_RETURN_ERROR(
+      src_tensor->scalar_type() == ::executorch::aten::ScalarType::Float,
+      InvalidArgument,
+      "Float16 conversion only supported from Float source data");
+
+  const auto num_elements = static_cast<size_t>(src_tensor->numel());
+  const float *float_data = src_tensor->const_data_ptr<float>();
+
+  auto half_tensor = ::executorch::extension::empty_like(
+      src_tensor, ::executorch::aten::ScalarType::Half);
+  auto *half_data = half_tensor->mutable_data_ptr<::executorch::aten::Half>();
+  for (size_t i = 0; i < num_elements; ++i) {
+    half_data[i] = ::executorch::aten::Half(float_data[i]);
+  }
+
+  return half_tensor;
+}
+
+/**
+ * Convert a Float tensor to `dtype` (Float passthrough, BFloat16, or Half).
+ * Used to match an exported method's declared input dtype when preprocessing
+ * produces fp32 data. Returns InvalidArgument for unsupported targets.
+ */
+inline ::executorch::runtime::Result<::executorch::extension::TensorPtr>
+convert_from_float(const ::executorch::extension::TensorPtr &src_tensor,
+                   ::executorch::aten::ScalarType dtype) {
+  using ::executorch::aten::ScalarType;
+  switch (dtype) {
+  case ScalarType::Float:
+    return src_tensor;
+  case ScalarType::BFloat16:
+    return convert_to_bfloat16(src_tensor);
+  case ScalarType::Half:
+    return convert_to_float16(src_tensor);
+  default:
+    ET_CHECK_OR_RETURN_ERROR(
+        false, InvalidArgument,
+        "Unsupported target dtype %hhd for float conversion",
+        static_cast<int8_t>(dtype));
+  }
+}
+
 } // namespace llm
 } // namespace extension
 } // namespace executorch

diff --git a/packages/react-native-executorch/src/constants/modelRegistry.ts b/packages/react-native-executorch/src/constants/modelRegistry.ts
@@ -260,6 +260,64 @@ const GEMMA4_E2B_MM_VARIANTS = {
   },
 };
 
+const LFM2_5_350M_VARIANTS = {
+  mlx: { base: { ...M.LFM2_5_350M, modelSource: M.LFM2_5_350M_MLX_MODEL } },
+  xnnpack: { base: M.LFM2_5_350M, quant: M.LFM2_5_350M_QUANTIZED },
+};
+
+const LFM2_5_1_2B_INSTRUCT_VARIANTS = {
+  mlx: {
+    base: {
+      ...M.LFM2_5_1_2B_INSTRUCT,
+      modelSource: M.LFM2_5_1_2B_INSTRUCT_MLX_MODEL,
+    },
+  },
+  xnnpack: {
+    base: M.LFM2_5_1_2B_INSTRUCT,
+    quant: M.LFM2_5_1_2B_INSTRUCT_QUANTIZED,
+  },
+};
+
+const LFM2_5_VL_1_6B_VARIANTS = {
+  mlx: {
+    base: {
+      ...M.LFM2_5_VL_1_6B_QUANTIZED,
+      modelSource: M.LFM2_5_VL_1_6B_MLX_MODEL,
+    },
+  },
+  xnnpack: { base: M.LFM2_5_VL_1_6B_QUANTIZED },
+};
+
+const LFM2_5_VL_450M_VARIANTS = {
+  mlx: {
+    base: {
+      ...M.LFM2_5_VL_450M_QUANTIZED,
+      modelSource: M.LFM2_5_VL_450M_MLX_MODEL,
+    },
+  },
+  xnnpack: { base: M.LFM2_5_VL_450M_QUANTIZED },
+};
+
+const PRIVACY_FILTER_OPENAI_VARIANTS = {
+  mlx: {
+    base: {
+      ...M.PRIVACY_FILTER_OPENAI,
+      modelSource: M.PRIVACY_FILTER_OPENAI_MLX_MODEL,
+    },
+  },
+  xnnpack: { base: M.PRIVACY_FILTER_OPENAI },
+};
+
+const PRIVACY_FILTER_NEMOTRON_VARIANTS = {
+  mlx: {
+    base: {
+      ...M.PRIVACY_FILTER_NEMOTRON,
+      modelSource: M.PRIVACY_FILTER_NEMOTRON_MLX_MODEL,
+    },
+  },
+  xnnpack: { base: M.PRIVACY_FILTER_NEMOTRON },
+};
+
 const EFFICIENTNET_V2_S_VARIANTS = {
   xnnpack: {
     base: {
@@ -594,20 +652,19 @@ export const models = {
     smollm2_1_360m: pair(M.SMOLLM2_1_360M, M.SMOLLM2_1_360M_QUANTIZED),
     smollm2_1_1_7b: pair(M.SMOLLM2_1_1_7B, M.SMOLLM2_1_1_7B_QUANTIZED),
     phi_4_mini_4b: pair(M.PHI_4_MINI_4B, M.PHI_4_MINI_4B_QUANTIZED),
-    lfm2_5_350m: pair(M.LFM2_5_350M, M.LFM2_5_350M_QUANTIZED),
-    lfm2_5_1_2b_instruct: pair(
-      M.LFM2_5_1_2B_INSTRUCT,
-      M.LFM2_5_1_2B_INSTRUCT_QUANTIZED
-    ),
+    lfm2_5_350m: variant(LFM2_5_350M_VARIANTS, { ios: 'mlx' }),
+    lfm2_5_1_2b_instruct: variant(LFM2_5_1_2B_INSTRUCT_VARIANTS, {
+      ios: 'mlx',
+    }),
     bielik_v3_0_1_5b: pair(M.BIELIK_V3_0_1_5B, M.BIELIK_V3_0_1_5B_QUANTIZED),
     gemma4_e2b: variant(GEMMA4_E2B_VARIANTS, {
       ios: 'mlx',
       android: 'vulkan',
     }),
     // Multimodal LLMs — same hook/module as plain LLMs, listed here so users
     // pick a model by capability ("LLM") rather than by modality.
-    lfm2_5_vl_1_6b: base(M.LFM2_5_VL_1_6B_QUANTIZED),
-    lfm2_5_vl_450m: base(M.LFM2_5_VL_450M_QUANTIZED),
+    lfm2_5_vl_1_6b: variant(LFM2_5_VL_1_6B_VARIANTS, { ios: 'mlx' }),
+    lfm2_5_vl_450m: variant(LFM2_5_VL_450M_VARIANTS, { ios: 'mlx' }),
     gemma4_e2b_multimodal: variant(GEMMA4_E2B_MM_VARIANTS, {
       ios: 'mlx',
       android: 'vulkan',
@@ -617,8 +674,8 @@ export const models = {
     efficientnet_v2_s: variant(EFFICIENTNET_V2_S_VARIANTS),
   },
   privacy_filter: {
-    openai: base(M.PRIVACY_FILTER_OPENAI),
-    nemotron: base(M.PRIVACY_FILTER_NEMOTRON),
+    openai: variant(PRIVACY_FILTER_OPENAI_VARIANTS, { ios: 'mlx' }),
+    nemotron: variant(PRIVACY_FILTER_NEMOTRON_VARIANTS, { ios: 'mlx' }),
   },
   object_detection: {
     ssdlite_320_mobilenet_v3_large: variant(

diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -450,6 +450,7 @@ export const PHI_4_MINI_4B_QUANTIZED = {
 // LFM2.5-1.2B-Instruct
 const LFM2_5_1_2B_INSTRUCT_MODEL = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/1_2b/xnnpack/lfm_2_5_1_2b_xnnpack_fp16.pte`;
 const LFM2_5_1_2B_INSTRUCT_QUANTIZED_MODEL = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/1_2b/xnnpack/lfm_2_5_1_2b_xnnpack_8da4w.pte`;
+export const LFM2_5_1_2B_INSTRUCT_MLX_MODEL = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/1_2b/mlx/lfm_2_5_1_2b_mlx_int4.pte`;
 const LFM2_5_1_2B_TOKENIZER = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/1_2b/tokenizer.json`;
 const LFM2_5_1_2B_TOKENIZER_CONFIG = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/1_2b/tokenizer_config.json`;
 
@@ -476,6 +477,7 @@ export const LFM2_5_1_2B_INSTRUCT_QUANTIZED = {
 // LFM2.5-350M
 const LFM2_5_350M_MODEL = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/350m/xnnpack/lfm_2_5_350m_xnnpack_fp16.pte`;
 const LFM2_5_350M_QUANTIZED_MODEL = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/350m/xnnpack/lfm_2_5_350m_xnnpack_8da4w.pte`;
+export const LFM2_5_350M_MLX_MODEL = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/350m/mlx/lfm_2_5_350m_mlx_int4.pte`;
 const LFM2_5_350M_TOKENIZER = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/350m/tokenizer.json`;
 const LFM2_5_350M_TOKENIZER_CONFIG = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/350m/tokenizer_config.json`;
 
@@ -527,11 +529,13 @@ export const BIELIK_V3_0_1_5B_QUANTIZED = {
 
 // LFM2.5-VL-1.6B
 const LFM2_VL_1_6B_QUANTIZED_MODEL = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/vl_1_6b/xnnpack/lfm_2_5_vl_1_6b_xnnpack_8da4w.pte`;
+export const LFM2_5_VL_1_6B_MLX_MODEL = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/vl_1_6b/mlx/lfm_2_5_vl_1_6b_mlx_int4.pte`;
 const LFM2_VL_1_6B_TOKENIZER = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/vl_1_6b/tokenizer.json`;
 const LFM2_VL_1_6B_TOKENIZER_CONFIG = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/vl_1_6b/tokenizer_config.json`;
 
 // LFM2.5-VL-450M
 const LFM2_VL_450M_QUANTIZED_MODEL = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/vl_450m/xnnpack/lfm_2_5_vl_450m_xnnpack_8da4w.pte`;
+export const LFM2_5_VL_450M_MLX_MODEL = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/vl_450m/mlx/lfm_2_5_vl_450m_mlx_int4.pte`;
 const LFM2_VL_450M_TOKENIZER = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/vl_450m/tokenizer.json`;
 const LFM2_VL_450M_TOKENIZER_CONFIG = `${URL_PREFIX}-lfm-2.5/${PREVIOUS_VERSION_TAG}/vl_450m/tokenizer_config.json`;
 
@@ -1281,6 +1285,8 @@ export const PRIVACY_FILTER_OPENAI = {
   tokenizerSource: `${URL_PREFIX}-privacy-filter-openai/${PREVIOUS_VERSION_TAG}/tokenizer.json`,
 } as const;
 
+export const PRIVACY_FILTER_OPENAI_MLX_MODEL = `${URL_PREFIX}-privacy-filter-openai/${PREVIOUS_VERSION_TAG}/mlx/privacy_filter_openai_mlx_int4.pte`;
+
 /**
  * OpenMed/privacy-filter-nemotron — extended PII detector with 55 entity
  * types (adds medical, financial, identity, technical, demographic, etc.).
@@ -1293,6 +1299,8 @@ export const PRIVACY_FILTER_NEMOTRON = {
   tokenizerSource: `${URL_PREFIX}-privacy-filter-nemotron/${PREVIOUS_VERSION_TAG}/tokenizer.json`,
 } as const;
 
+export const PRIVACY_FILTER_NEMOTRON_MLX_MODEL = `${URL_PREFIX}-privacy-filter-nemotron/${PREVIOUS_VERSION_TAG}/mlx/privacy_filter_nemotron_mlx_int8.pte`;
+
 // Image generation
 
 /**