From deb429751bed75f1a9e1d578a835823241cef626 Mon Sep 17 00:00:00 2001
From: Keigh Rim <keigh.rim@gmail.com>
Date: Mon, 25 May 2026 22:31:44 -0400
Subject: [PATCH 01/10] bumped mmif-python to 1.5.0

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1b3fc4e..07f1e9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ classifiers = [
     "Programming Language :: Python :: 3 :: Only",
 ]
 dependencies = [
-    "mmif-python==1.4.0",
+    "mmif-python==1.5.0",
     "Flask>=2",
     "Flask-RESTful>=0.3.9",
     "gunicorn>=20",

From 962dac589ac72eed7907ea5cdb8c85ae8ebbd09c Mon Sep 17 00:00:00 2001
From: Keigh Rim <keigh.rim@gmail.com>
Date: Fri, 22 May 2026 16:06:53 -0400
Subject: [PATCH 02/10] drafted documentation for promptable base class, added
 bare skeleton of the class

---
 clams/app/__init__.py                         | 249 ++++++++++++-
 clams/develop/templates/app/app.py.template   |  11 +-
 .../templates/app/metadata.py.template        |  28 +-
 documentation/app-baseclasses.rst             | 352 ++++++++++++++++++
 documentation/index.rst                       |   1 +
 documentation/introduction.rst                |   5 +-
 documentation/runtime-params.rst              |  18 +
 documentation/tutorial.md                     |   2 +-
 8 files changed, 656 insertions(+), 10 deletions(-)
 create mode 100644 documentation/app-baseclasses.rst

diff --git a/clams/app/__init__.py b/clams/app/__init__.py
index 1d3f0a2..94e01e4 100644
--- a/clams/app/__init__.py
+++ b/clams/app/__init__.py
@@ -9,11 +9,11 @@
 from datetime import datetime
 from urllib import parse as urlparser
 
-__all__ = ['ClamsApp']
+__all__ = ['ClamsApp', 'ClamsPromptableApp']
 
 from typing import Union, Any, Optional, Dict, List, Tuple
 
-from mmif import Mmif, Document, DocumentTypes, View
+from mmif import Mmif, Document, DocumentTypes, View, AnnotationTypes
 from mmif.utils.video_document_helper import (
     SamplingMode, SAMPLING_MODE_DESCRIPTIONS, SAMPLING_MODE_DEFAULT,
     _sampling_mode,
@@ -131,7 +131,7 @@ def _load_appmetadata(self) -> AppMetadata:
         In any case, :class:`~clams.appmetadata.AppMetadata` class must be useful.
         
         For metadata specification, 
-        see `https://sdk.clams.ai/appmetadata.jsonschema <../appmetadata.jsonschema>`_. 
+        see `https://clams.ai/clams-python/appmetadata.jsonschema <../appmetadata.jsonschema>`_. 
         """
         cwd = pathlib.Path(sys.modules[self.__module__].__file__).parent
         
@@ -639,6 +639,249 @@ def open_document_location(document: Union[str, Document], opener: Any = open, *
                     raise FileNotFoundError(p.path)
 
 
+class ClamsPromptableApp(ClamsApp):
+    """
+    Base class for CLAMS apps that wrap a promptable model (an LLM or
+    other multimodal model, local or remote). Standardizes the runtime
+    parameter surface
+    (prompt, generation hyperparameters, batch size) and provides
+    helpers for building chat conversations and persisting model
+    responses into MMIF.
+
+    The standardized parameters are listed in
+    :py:attr:`promptable_parameters` and added to an app's metadata via
+    :py:meth:`inject_promptable_parameters`. Promptable-app developers
+    MUST call that helper at the end of their ``appmetadata()`` function
+    in ``metadata.py``. The reservation rule (these parameter names are
+    SDK-managed and apps cannot redeclare them) is enforced implicitly
+    via :py:meth:`AppMetadata.add_parameter`'s existing duplicate-name
+    check.
+
+    Inference is performed by :py:meth:`generate`, which subclasses MUST
+    implement. The base class provides:
+
+    * :py:meth:`inject_promptable_parameters` — add the SDK-managed
+      parameter set to ``AppMetadata``
+    * :py:meth:`build_conversation` — assemble a chat-template-compatible
+      message list (stub in this release)
+    * :py:meth:`store_response` — persist a generated response into a
+      view as ``TextDocument`` + ``Alignment``
+    """
+
+    #: SDK-managed runtime parameters injected into every promptable app.
+    #: These names are reserved — apps cannot redeclare them with
+    #: customized specs.
+    promptable_parameters = [
+        {
+            'name': 'prompt', 'type': 'string', 'multivalued': True,
+            'description':
+                'User prompt(s) sent to the model. A single value runs as a '
+                'one-shot generation. A multi-value list is interpreted as a '
+                'multi-turn static prompt; see ``promptMode`` for how turns '
+                'are assembled.',
+        },
+        {
+            'name': 'systemPrompt', 'type': 'string', 'default': '',
+            'description':
+                'Optional system-role text prepended to the conversation. '
+                'Empty by default.',
+        },
+        {
+            'name': 'promptMode', 'type': 'string',
+            'choices': ['user-only', 'turn-taking'],
+            'default': 'turn-taking',
+            'description':
+                'How to interpret a multi-value ``prompt`` list. '
+                'Has no effect when ``prompt`` has a single value. '
+                'For semantics of each choice and worked examples, see '
+                'https://clams.ai/clams-python/app-baseclasses.html#promptable-multiturn',
+        },
+        {
+            'name': 'maxNewTokens', 'type': 'integer', 'default': 512,
+            'description':
+                'Maximum number of new tokens generated per inference call. '
+                'Forwarded to the backend\'s ``generate``-equivalent. Larger '
+                'values grow the KV cache linearly and increase GPU memory '
+                'usage; reduce if VRAM is constrained.',
+        },
+        {
+            'name': 'temperature', 'type': 'number', 'default': 0.0,
+            'description':
+                'Sampling temperature. The default ``0.0`` selects '
+                'deterministic / greedy decoding for maximum reproducibility; '
+                'override for sampled generation.',
+        },
+        {
+            'name': 'topP', 'type': 'number', 'default': 1.0,
+            'description':
+                'Nucleus-sampling cumulative probability cutoff. Only '
+                'meaningful when ``temperature`` is greater than 0.',
+        },
+        {
+            'name': 'topK', 'type': 'integer', 'default': 50,
+            'description':
+                'Top-K sampling cutoff. Only meaningful when ``temperature`` '
+                'is greater than 0.',
+        },
+        {
+            'name': 'batchSize', 'type': 'integer', 'default': 1,
+            'description':
+                'How many input items the app groups per ``generate`` call. '
+                'GPU memory scales roughly linearly with batch size; raise '
+                'for throughput on GPUs with headroom, keep at ``1`` on '
+                'memory-tight setups.',
+        },
+    ]
+
+    @staticmethod
+    def inject_promptable_parameters(metadata: AppMetadata) -> None:
+        """
+        Add the SDK-managed promptable parameters to ``metadata``. Call
+        this at the end of your app's ``appmetadata()`` function in
+        ``metadata.py`` if your app subclasses
+        :py:class:`ClamsPromptableApp`.
+
+        The reservation rule is enforced implicitly: if the app had
+        already called ``metadata.add_parameter('prompt', ...)`` (or
+        any other promptable name) before this helper, the helper's own
+        ``add_parameter`` call will trip the existing duplicate-name
+        ``ValueError`` in :py:meth:`AppMetadata.add_parameter`.
+
+        :param metadata: the :class:`AppMetadata` instance being built
+        """
+        for param in ClamsPromptableApp.promptable_parameters:
+            metadata.add_parameter(**param)
+
+    def __init__(self):
+        # ``ClamsApp.__init__`` loads the app's ``metadata.py``, which
+        # is expected to have already called
+        # ``inject_promptable_parameters()`` from inside
+        # ``appmetadata()``. The parent ``__init__`` then iterates
+        # ``self.metadata.parameters`` to populate
+        # ``annotate_param_spec`` and build the caster — so the
+        # promptable parameters are already covered by the time we land
+        # here. We only validate that the helper was actually called.
+        super().__init__()
+        declared = {p.name for p in self.metadata.parameters}
+        expected = {p['name'] for p in ClamsPromptableApp.promptable_parameters}
+        missing = expected - declared
+        if missing:
+            raise ValueError(
+                f"Promptable parameters {sorted(missing)} are missing "
+                f"from the app metadata. Promptable apps must call "
+                f"``ClamsPromptableApp.inject_promptable_parameters("
+                f"metadata)`` inside their ``appmetadata()`` function "
+                f"in ``metadata.py``."
+            )
+
+    @abstractmethod
+    def generate(
+            self,
+            prompt: List[str],
+            system_prompt: str = '',
+            images: Optional[List[Any]] = None,
+            audio: Optional[List[Any]] = None,
+            prompt_mode: str = 'turn-taking',
+            batch_size: int = 1,
+            **generation_params,
+    ) -> List[str]:
+        """
+        Run inference on the given prompt against the given inputs.
+        Subclasses MUST implement this.
+
+        The return value is a flat list of strings: one entry per input
+        item (one per image when ``images`` is given, one per audio clip
+        when ``audio`` is given, or a singleton for text-only
+        single-shot generation).
+
+        :param prompt: a ``List[str]`` of prompt turns. A single-element
+            list is one-shot. A multi-element list is multi-turn and is
+            assembled according to ``prompt_mode``.
+        :param system_prompt: optional system-role text prepended to the
+            conversation
+        :param images: optional list of input images to broadcast across
+            the prompt (one generation per image)
+        :param audio: optional list of input audio clips
+        :param prompt_mode: ``"turn-taking"`` (default) or ``"user-only"``;
+            see :py:attr:`promptable_parameters`
+        :param batch_size: max number of items per underlying
+            ``generate`` call
+        :param generation_params: any additional backend-specific
+            generation kwargs (``maxNewTokens``, ``temperature``,
+            ``topP``, ``topK``, etc.)
+        :return: one generated string per input item
+        :rtype: List[str]
+        """
+        raise NotImplementedError
+
+    def build_conversation(
+            self,
+            prompt: Union[str, List[str]],
+            system_prompt: str = '',
+            images: Optional[List[Any]] = None,
+            audio: Optional[List[Any]] = None,
+            prompt_mode: str = 'turn-taking',
+    ) -> Union[List[dict], List[List[dict]]]:
+        """
+        Build a chat-template-compatible message list (or a list of
+        message lists for ``user-only`` mode).
+
+        Defined as an instance method so subclasses can override and
+        access model-specific state (``self.processor``,
+        ``self.tokenizer``, etc.) when formatting messages.
+
+        .. note::
+           The base implementation is a stub in this release and raises
+           :py:class:`NotImplementedError`. Subclasses must override.
+           A default implementation will be added in a follow-up.
+        """
+        raise NotImplementedError(
+            "ClamsPromptableApp.build_conversation() is a stub in this "
+            "release. Override it in your subclass, or wait for the base "
+            "implementation in a follow-up."
+        )
+
+    def store_response(
+            self,
+            view: View,
+            source: str,
+            answer: str,
+            trace: Optional[str] = None,
+    ) -> Tuple[Any, Any]:
+        """
+        Persist a generated response into a view as a
+        ``TextDocument`` (containing ``answer``) plus an
+        ``Alignment`` linking ``source`` to the new TextDocument.
+
+        :param view: the :class:`View` to write into. The caller is
+            responsible for having called
+            :meth:`View.new_contain` for ``TextDocument`` and
+            ``Alignment`` first if needed.
+        :param source: ``long_id`` of the source annotation that
+            produced the response (e.g. a ``TimePoint`` or
+            ``ImageDocument``)
+        :param answer: the text generated by the model
+        :param trace: optional reasoning trace. NOT YET SUPPORTED —
+            passing a non-``None`` value raises
+            :py:class:`NotImplementedError` before ``view`` is
+            modified. Storage convention is still being decided at
+            clamsproject/clams-python#263.
+        :return: ``(TextDocument, Alignment)`` tuple of the new
+            annotations
+        """
+        if trace is not None:
+            raise NotImplementedError(
+                "Reasoning-trace storage convention is not yet defined; "
+                "tracked at clamsproject/clams-python#263."
+            )
+        td = view.new_textdocument(text=answer)
+        align = view.new_annotation(
+            AnnotationTypes.Alignment,
+            properties={'source': source, 'target': td.long_id},
+        )
+        return td, align
+
+
 class ParameterCaster(object):
 
     def __init__(self, param_spec: Dict[str, Tuple[str, bool]]):
diff --git a/clams/develop/templates/app/app.py.template b/clams/develop/templates/app/app.py.template
index a7a4cc5..d307ab4 100644
--- a/clams/develop/templates/app/app.py.template
+++ b/clams/develop/templates/app/app.py.template
@@ -24,19 +24,26 @@ from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentType
 from lapps.discriminators import Uri
 
 
+# If your app is a prompt-driven LLM/VLM/audio-LM app, change the base class
+# below from ``ClamsApp`` to ``ClamsPromptableApp`` (import via
+# ``from clams import ClamsPromptableApp``) and implement ``generate()``
+# instead of (or in addition to) ``_annotate()``. Don't forget to also
+# uncomment the matching helper call in ``metadata.py``. See
+# https://clams.ai/clams-python/app-baseclasses.html#promptable for the
+# developer guide.
 class $APP_CLASS_NAME(ClamsApp):
 
     def __init__(self):
         super().__init__()
 
     def _appmetadata(self):
-        # see https://sdk.clams.ai/autodoc/clams.app.html#clams.app.ClamsApp._load_appmetadata
+        # see https://clams.ai/clams-python/autodoc/clams.app.html#clams.app.ClamsApp._load_appmetadata
         # Also check out ``metadata.py`` in this directory.
         # When using the ``metadata.py`` leave this do-nothing "pass" method here.
         pass
 
     def _annotate(self, mmif: Mmif, **parameters) -> Mmif:
-        # see https://sdk.clams.ai/autodoc/clams.app.html#clams.app.ClamsApp._annotate
+        # see https://clams.ai/clams-python/autodoc/clams.app.html#clams.app.ClamsApp._annotate
         raise NotImplementedError
 
 def get_app():
diff --git a/clams/develop/templates/app/metadata.py.template b/clams/develop/templates/app/metadata.py.template
index 93aec79..f9596fd 100644
--- a/clams/develop/templates/app/metadata.py.template
+++ b/clams/develop/templates/app/metadata.py.template
@@ -16,8 +16,8 @@ def appmetadata() -> AppMetadata:
     """
     Function to set app-metadata values and return it as an ``AppMetadata`` obj.
     Read these documentations before changing the code below
-    - https://sdk.clams.ai/appmetadata.html metadata specification.
-    - https://sdk.clams.ai/autodoc/clams.appmetadata.html python API
+    - https://clams.ai/clams-python/appmetadata.html metadata specification.
+    - https://clams.ai/clams-python/autodoc/clams.appmetadata.html python API
     
     :return: AppMetadata object holding all necessary information.
     """
@@ -51,7 +51,29 @@ def appmetadata() -> AppMetadata:
     metadata.add_parameter(name='a_param', description='example parameter description',
                            type='boolean', default='false')
     # metadta.add_parameter(more...)
-    
+
+    # If your app subclasses ``ClamsPromptableApp`` (a prompt-driven LLM/VLM/audio-LM
+    # app), uncomment the following two lines to add the SDK-managed promptable
+    # parameters (prompt, systemPrompt, temperature, maxNewTokens, etc.) to your
+    # app's metadata. See
+    # https://clams.ai/clams-python/app-baseclasses.html#promptable for the
+    # developer guide. Reminder: these parameter names are reserved by the SDK
+    # — do not redeclare any of them above.
+    # from clams.app import ClamsPromptableApp
+    # ClamsPromptableApp.inject_promptable_parameters(metadata)
+    #
+    # To customize the default value of any promptable parameter (e.g. provide an
+    # app-specific default ``prompt``, raise ``maxNewTokens``, pin ``batchSize``,
+    # etc.), mutate the ``default`` field on the already-injected parameter
+    # object — the SDK does NOT allow re-declaring promptable param names. See
+    # https://clams.ai/clams-python/app-baseclasses.html#promptable-customizing-defaults
+    # for details. Example:
+    # for p in metadata.parameters:
+    #     if p.name == 'prompt':
+    #         p.default = ['Describe what is in this image.']
+    #     elif p.name == 'maxNewTokens':
+    #         p.default = 2048
+
     # CHANGE this line and make sure return the compiled `metadata` instance
     return None
 
diff --git a/documentation/app-baseclasses.rst b/documentation/app-baseclasses.rst
new file mode 100644
index 0000000..2e325fd
--- /dev/null
+++ b/documentation/app-baseclasses.rst
@@ -0,0 +1,352 @@
+.. _app-baseclasses:
+
+Specialized App Base Classes
+============================
+
+Beyond the bare-minimum :class:`~clams.app.ClamsApp` introduced in
+:ref:`introduction`, the SDK provides specialized base classes that capture
+common structural patterns for CLAMS apps. Each specialized base class
+extends :class:`~clams.app.ClamsApp` with a standardized runtime parameter
+surface and helper methods appropriate to its category of app. App
+developers inherit from the specialized base class that best matches what
+their app does, instead of inheriting from :class:`~clams.app.ClamsApp`
+directly.
+
+This page first recaps what every CLAMS app inherits from
+:class:`~clams.app.ClamsApp` (the baseline), then documents each
+specialized base class and what it adds on top.
+
+.. _app-baseline:
+
+What every CLAMS app inherits
+-----------------------------
+
+Every CLAMS app subclasses :class:`~clams.app.ClamsApp` (directly or via
+a specialized base class such as :class:`~clams.app.ClamsPromptableApp`)
+and inherits its baseline behaviors: parameter casting and refinement,
+view signing, JSON envelope unwrapping, CUDA memory profiling and
+cleanup, error views, and a set of *universal* runtime parameters that
+the SDK auto-injects into every app's metadata.
+
+Universal parameters
+^^^^^^^^^^^^^^^^^^^^
+
+These are added automatically by :meth:`~clams.app.ClamsApp.__init__` at
+runtime and by the standard ``metadata.py`` template's ``__main__`` block
+when ``python metadata.py`` is run. App developers do not declare them.
+
+.. list-table::
+   :header-rows: 1
+   :widths: 18 12 18 8 44
+
+   * - Name
+     - Type
+     - Default
+     - Multi-valued
+     - Notes
+   * - ``pretty``
+     - boolean
+     - ``false``
+     - no
+     - When ``true``, the response MMIF JSON is re-formatted with
+       2-space indentation.
+   * - ``runningTime``
+     - boolean
+     - ``true``
+     - no
+     - When ``true``, the running time of the request is recorded in
+       the view metadata.
+   * - ``hwFetch``
+     - boolean
+     - ``false``
+     - no
+     - When ``true``, host hardware info (architecture, GPU and vRAM)
+       is recorded in the view metadata.
+   * - ``tfSamplingMode``
+     - string
+     - ``'representatives'``
+     - no
+     - For apps that process ``TimeFrame`` annotations: how to sample
+       frames within each TimeFrame. Choices: ``'representatives'``,
+       ``'single'``, ``'all'``. No effect on apps that do not process
+       TimeFrames.
+
+.. _sdk-managed-reserved:
+
+SDK-managed parameter names are reserved
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Parameter names added by the SDK (the universal parameters listed
+above, plus any parameters added by a specialized base class) are
+reserved. An app's ``appmetadata()`` MUST NOT declare any of these
+names via :meth:`~clams.appmetadata.AppMetadata.add_parameter` directly; doing so trips
+the existing duplicate-name ``ValueError`` when the SDK tries to add
+its own spec.
+
+This reservation guarantees a uniform, predictable parameter interface
+across all CLAMS apps. App developers can still customize a reserved
+parameter's *default value* (but not its ``type``, ``multivalued``, or
+``choices``) by mutating the ``default`` field on the already-injected
+parameter object; see :ref:`promptable-customizing-defaults` for a
+worked example.
+
+.. _promptable:
+
+Promptable CLAMS Apps
+---------------------
+
+A **promptable app** is a CLAMS app that wraps a promptable model: a large
+language model (LLM), vision-language model (VLM), audio-language model
+(ALM), large multimodal model (LMM), or remote generative API. The SDK
+provides :class:`~clams.app.ClamsPromptableApp` as a specialized base class
+for these apps. It standardizes the runtime parameter surface (prompts,
+generation hyperparameters, batch size) and provides helpers for building
+chat conversations and persisting model responses into MMIF.
+
+This section is the developer guide for writing or migrating a CLAMS app
+that inherits from :class:`~clams.app.ClamsPromptableApp`. For the general
+CLAMS app development pattern, see the :ref:`introduction`,
+:ref:`tutorial`, and :ref:`runtime-params` pages.
+
+When to use ``ClamsPromptableApp``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Choose :class:`~clams.app.ClamsPromptableApp` over :class:`~clams.app.ClamsApp`
+when your app's core operation is "given a prompt and some input
+(image/audio/text/structured data), return generated text." Concretely:
+
+- Image captioning, VLM-based OCR, scene description
+- Audio captioning, transcription via ALMs
+- Summarization, classification, structured-data extraction via LLMs
+- Tasks driven by an LMM that takes mixed-modality inputs
+- Any app that wraps a remote LLM, VLM, ALM, or LMM API and forwards a prompt
+
+If your app does not call a generative model (e.g. a classical OCR engine,
+a speech-to-text engine that doesn't take prompts, a classifier wrapping a
+discriminative model), keep using :class:`~clams.app.ClamsApp` directly.
+
+.. note::
+
+   ``ClamsPromptableApp`` assumes an **instruction-tuned or chat-tuned**
+   model: one that has been fine-tuned to follow natural-language
+   instructions and that understands a system/user/assistant role
+   structure. The parameter
+   surface (``systemPrompt``, ``promptMode``'s turn-taking semantics, the
+   chat-template message list produced by ``build_conversation``) presupposes
+   this. Bare completion / next-token-prediction base models that have not
+   been instruction-tuned do not fit this base class cleanly; for those, use
+   :class:`~clams.app.ClamsApp` directly and design your own parameter surface.
+
+Standardized runtime parameters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Every :class:`~clams.app.ClamsPromptableApp` exposes the following
+SDK-managed runtime parameters in addition to the universal parameters
+from :class:`~clams.app.ClamsApp`. These names are reserved; see
+:ref:`sdk-managed-reserved`.
+
+.. list-table::
+   :header-rows: 1
+   :widths: 18 12 18 8 44
+
+   * - Name
+     - Type
+     - Default
+     - Multi-valued
+     - Notes
+   * - ``prompt``
+     - string
+     - *(required, no default)*
+     - yes
+     - User prompt(s) sent to the model. A single value runs as a one-shot
+       generation. A multi-value list is interpreted as a multi-turn static
+       prompt; see :ref:`promptable-multiturn`.
+   * - ``systemPrompt``
+     - string
+     - ``''``
+     - no
+     - Optional system-role text prepended to the conversation.
+   * - ``promptMode``
+     - string
+     - ``'turn-taking'``
+     - no
+     - How to interpret a multi-value ``prompt`` list. Choices:
+       ``'turn-taking'`` or ``'user-only'``. See :ref:`promptable-multiturn`.
+   * - ``maxNewTokens``
+     - integer
+     - ``512``
+     - no
+     - Maximum number of new tokens generated per inference call. Larger values
+       grow the KV cache linearly and add to GPU memory usage; reduce if VRAM
+       is constrained.
+   * - ``temperature``
+     - number
+     - ``0.0``
+     - no
+     - Sampling temperature. ``0.0`` selects deterministic / greedy decoding
+       for maximum reproducibility; override for sampled generation.
+   * - ``topP``
+     - number
+     - ``1.0``
+     - no
+     - Nucleus-sampling cumulative probability cutoff. Only meaningful when
+       ``temperature`` > 0.
+   * - ``topK``
+     - integer
+     - ``50``
+     - no
+     - Top-K sampling cutoff. Only meaningful when ``temperature`` > 0.
+   * - ``batchSize``
+     - integer
+     - ``1``
+     - no
+     - How many input items the app groups per ``generate`` call. GPU memory
+       scales roughly linearly with batch size; raise for throughput on
+       GPUs with headroom, keep at ``1`` on memory-tight setups.
+
+.. _promptable-customizing-defaults:
+
+Customizing default values
+""""""""""""""""""""""""""
+
+The SDK ships sensible defaults for most promptable parameters but
+deliberately leaves ``prompt`` **without** a default; prompts are
+inherently app-specific and no single value is right for all apps.
+Beyond ``prompt``, other defaults may also be inappropriate for a given
+app: a model that needs longer outputs wants a higher ``maxNewTokens``,
+a small-VRAM deployment wants ``batchSize`` pinned at ``1``, etc.
+
+Because the reservation rule prevents calling
+``metadata.add_parameter('prompt', ...)`` (or any other promptable name)
+directly, the recommended pattern for customizing defaults is to mutate
+the ``default`` field on the already-injected parameter object right
+after calling :meth:`~clams.app.ClamsPromptableApp.inject_promptable_parameters`.
+You'll see a worked example of this in the ``metadata.py`` generated
+by the ``clams develop`` scaffold.
+
+This works for any promptable parameter. The parameter spec itself
+(``type``, ``multivalued``, ``choices``) stays locked by the SDK; only
+the ``default`` field is meant to be mutated this way, which preserves
+the cross-app uniformity that the reservation rule is designed to
+guarantee.
+
+If an app *wants* to require callers to pass a value explicitly (for
+``prompt`` or any other parameter), it can simply leave the default
+unchanged. ``prompt`` already has no default, so the SDK will raise a
+"required parameter" error if the caller omits it; for other params,
+deleting the SDK default and leaving it ``None`` would have the same
+effect, though that's rarely useful.
+
+.. _promptable-declaration:
+
+Declaring a promptable app
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+A promptable app requires two paired edits relative to the scaffold generated
+by ``clams develop``:
+
+1. In ``app.py``, change the app class's base from :class:`~clams.app.ClamsApp`
+   to :class:`~clams.app.ClamsPromptableApp` and implement
+   :meth:`~clams.app.ClamsPromptableApp.generate`. The scaffold file already
+   contains a guiding comment at the class declaration line.
+2. In ``metadata.py``, call
+   :meth:`~clams.app.ClamsPromptableApp.inject_promptable_parameters` at the
+   end of ``appmetadata()``. The scaffold file already contains a
+   commented-out helper-call block; uncomment it.
+
+The ``__main__`` block in ``metadata.py`` does NOT change; it stays identical
+to non-promptable apps.
+
+The helper call inside ``appmetadata()`` makes the promptable parameters
+visible to both ``python metadata.py`` (build-time discovery) and to
+:meth:`~clams.app.ClamsApp._load_appmetadata` (runtime). The base class
+change ensures the app inherits the parameter-presence validation, the
+abstract ``generate()`` contract, and the helper methods at runtime.
+
+The ``generate()`` contract
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Subclasses MUST implement :meth:`~clams.app.ClamsPromptableApp.generate`.
+See the method's docstring for the full signature and parameter semantics.
+
+The return value is a flat ``List[str]``: one entry per input item (one per
+image when ``images`` is given, one per audio clip when ``audio`` is given,
+or a single-element list for text-only single-shot generation). Keep
+inference logic inside ``generate()`` distinct from MMIF I/O; the latter
+belongs in ``_annotate()`` (which calls ``self.generate()``).
+
+This separation is intentional: future SDK releases may provide default
+implementations of ``generate()`` for common backends, at which point apps
+that kept inference and annotation creation separate will need no changes.
+
+.. _promptable-multiturn:
+
+Multi-turn handling (``promptMode``)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``prompt`` is always a ``List[str]`` after parameter casting. When the
+list has a single element, ``promptMode`` is irrelevant (single-shot
+generation). When the list has multiple elements, ``promptMode`` selects
+between two multi-element prompting strategies:
+
+**Turn-taking** (default). The list is interpreted as an alternating
+user/assistant conversation: even indices (0, 2, 4, ...) are user turns,
+odd indices are assistant turns. The full conversation is sent to the
+model in a single ``generate`` call. This mode supports any pattern
+that fits an alternating role structure, including few-shot in-context
+learning (where the (user, assistant) pairs are task exemplars and the
+final user turn is the new query), multi-turn dialogue continuation,
+and role-play scaffolding. Example (few-shot ICL): ``["Classify
+sentiment: 'I love this.'", "positive", "Classify sentiment: 'I hate
+this.'", "negative", "Classify sentiment: 'It's okay.'"]``: two
+exemplar pairs followed by a final query; one inference returns the
+final reply.
+
+**User-only**. Every element is a user turn; the model generates an
+assistant reply between each, in N successive ``generate`` calls. Only
+the final assistant response is returned per input item. This mode
+implements iterative / scripted multi-step prompting, a manual,
+externally-driven scaffold for stepwise reasoning. (It is distinct
+from in-model zero-shot chain-of-thought, where stepwise reasoning is
+elicited inside a single inference call by a prompt like "let's think
+step by step"; here, the user-side scaffolding makes the steps
+explicit and feeds each intermediate model output back as context for
+the next user turn.) Example (scripted multi-step reasoning):
+``["Step 1: identify objects.", "Step 2: describe relationships.",
+"Step 3: conclude."]``: three sequential user prompts, three
+inferences, final reply returned.
+
+``turn-taking`` is the default because it costs a single inference call
+and is the more common multi-element pattern.
+
+Helpers
+^^^^^^^
+
+:meth:`~clams.app.ClamsPromptableApp.inject_promptable_parameters`
+    A static method called from your app's ``appmetadata()`` (in
+    ``metadata.py``) to add the SDK-managed promptable parameters.
+
+:meth:`~clams.app.ClamsPromptableApp.build_conversation`
+    Instance method that constructs a chat-template-compatible message list
+    (or list of message lists for ``user-only`` mode). Subclasses may
+    override to access model-specific state (e.g. ``self.processor``).
+    Currently a stub; a default implementation is planned for a follow-up
+    release.
+
+:meth:`~clams.app.ClamsPromptableApp.store_response`
+    Helper for the common annotation-creation pattern: given a view, a
+    source annotation's ``long_id``, and a generated string, creates a
+    ``TextDocument`` containing the text plus an ``Alignment`` linking
+    source to TextDocument; returns the ``(text_document, alignment)``
+    pair. The optional ``trace`` parameter is reserved for
+    reasoning-trace storage; passing a non-``None`` value currently
+    raises :class:`NotImplementedError` (storage convention tracked in
+    clamsproject/clams-python#263).
+
+Backend helpers
+^^^^^^^^^^^^^^^
+
+For apps wrapping a local HuggingFace transformers model, the SDK provides
+a loading helper in ``clams.backends.hf``. *Documentation for the HF
+backend helper will be added in a follow-up release; see
+clamsproject/clams-python#263 for status.*
diff --git a/documentation/index.rst b/documentation/index.rst
index 9f6db33..5882e11 100644
--- a/documentation/index.rst
+++ b/documentation/index.rst
@@ -16,6 +16,7 @@ CLAMS Python SDK
   introduction
   input-output
   runtime-params
+  app-baseclasses
   gpu-apps
   appmetadata
   appdirectory
diff --git a/documentation/introduction.rst b/documentation/introduction.rst
index ce907e8..f2d420f 100644
--- a/documentation/introduction.rst
+++ b/documentation/introduction.rst
@@ -72,13 +72,16 @@ As a developer you can expose different behaviors of the ``annotate()`` method b
   These runtime configurations are not part of the MMIF input, but for reproducible analysis, you should record these configurations in the output MMIF. 
 
 .. note::
-  There are *universal* parameters defined at the SDK-level that all CLAMS apps commonly use. See :const:`clams.app.ClamsApp.universal_parameters`. 
+  Some runtime parameters are managed by the SDK itself rather than declared per-app. The *universal* parameters in :const:`clams.app.ClamsApp.universal_parameters` are one such set — they are auto-added to every CLAMS app. Specialized base classes (see below) add their own SDK-managed parameter sets on top.
 
 .. warning::
   All the runtime configurations should be pre-announced in the app metadata.
 
 Also see <:doc:`tutorial`> for a step-by-step tutorial on how to write the ``_annotate()`` method with a simple example NLP tool.
 
+.. note::
+  Inheriting from :class:`~clams.app.ClamsApp` directly works for any CLAMS app. For common app categories (e.g. apps wrapping LLMs or other multimodal models), the SDK provides specialized base classes that extend :class:`~clams.app.ClamsApp` with additional SDK-managed parameter sets and helpers. See :ref:`app-baseclasses`.
+
 appmetadata()
 """""""""""""
 
diff --git a/documentation/runtime-params.rst b/documentation/runtime-params.rst
index 4d3bf93..47aa8a5 100644
--- a/documentation/runtime-params.rst
+++ b/documentation/runtime-params.rst
@@ -190,6 +190,24 @@ For more complex value structures (e.g., comma-separated lists within values),
 the app developer is responsible for further parsing and should document the
 expected format in the parameter's ``description`` field.
 
+.. _runtime-params-promptable-note:
+
+Promptable apps: an extra SDK-managed parameter set
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For apps that wrap an **instruction- or chat-tuned** promptable model — an
+LLM or other multimodal model, local or remote — we recommend inheriting
+from :class:`~clams.app.ClamsPromptableApp` instead of
+:class:`~clams.app.ClamsApp`. The promptable base class adds a standardized,
+SDK-managed set of runtime parameters (``prompt``, ``systemPrompt``,
+``temperature``, ``maxNewTokens``, ``topP``, ``topK``, ``promptMode``,
+``batchSize``) on top of the universal parameters. If you use this base
+class, these names are reserved — your app's ``metadata.py`` must not
+redeclare them — and are added via a single helper call inside
+``appmetadata()``.
+
+See :ref:`promptable` for the full developer guide.
+
 .. _runtime-params-envelope-note:
 
 Note on JSON envelope input
diff --git a/documentation/tutorial.md b/documentation/tutorial.md
index b7f7469..58ae746 100644
--- a/documentation/tutorial.md
+++ b/documentation/tutorial.md
@@ -146,7 +146,7 @@ This means that if the user doesn't specify the value for these parameters at th
 If you want to make a parameter "optional" by providing a default value, you can do so by adding a `default` argument to the `add_parameter()` method.
 
 > **Note**
-> Also refer to [CLAMS App Metadata](https://sdk.clams.ai/appmetadata.html) for more details regarding what fields need to be specified.
+> Also refer to [CLAMS App Metadata](https://clams.ai/clams-python/appmetadata.html) for more details regarding what fields need to be specified.
 
 #### `_annotate()`
 The `_annotate()` method should accept a MMIF file/string/object as its first parameter and always returns a `MMIF` object with an additional `view` containing annotation results. This is where the bulk of your logic will go. For a text processing app, it is mostly concerned with finding text documents, calling the code that runs over the text, creating new views and inserting the results. 

From 48838161e1b226f229d206a53448c71d7dd5f5fb Mon Sep 17 00:00:00 2001
From: Keigh Rim <keigh.rim@gmail.com>
Date: Sun, 24 May 2026 14:44:28 -0400
Subject: [PATCH 03/10] updated documentation for HF backend refactoring

---
 documentation/app-baseclasses.rst | 85 +++++++++++++++++++++++++++++--
 pyproject.toml                    |  3 ++
 2 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/documentation/app-baseclasses.rst b/documentation/app-baseclasses.rst
index 2e325fd..488fa11 100644
--- a/documentation/app-baseclasses.rst
+++ b/documentation/app-baseclasses.rst
@@ -50,7 +50,7 @@ and by the standard ``metadata.py`` template's ``__main__`` block at
      - no
      - When ``true``, the response MMIF JSON is re-formatted with
        2-space indentation.
-   * - ``runningTime
+   * - ``runningTime``
      - boolean
      - ``true``
      - no
@@ -346,7 +346,82 @@ Helpers
 Backend helpers
 ^^^^^^^^^^^^^^^
 
-For apps wrapping a local HuggingFace transformers model, the SDK provides
-a loading helper in ``clams.backends.hf``. *Documentation for the HF
-backend helper will be added in a follow-up release; see
-clamsproject/clams-python#263 for status.*
+The SDK provides optional helper utilities for loading common
+inference backends, so apps don't have to write model-loading
+boilerplate themselves. Backends are kept as separate subpackages
+under ``clams.backends`` and their heavy dependencies are NOT pulled
+in by the base ``clams-python`` install; you opt in via a pip extra
+when your app needs the backend.
+
+.. _backends-hf:
+
+HuggingFace transformers (``clams.backends.hf``)
+""""""""""""""""""""""""""""""""""""""""""""""""
+
+:func:`clams.backends.hf.load_hf_model` loads any local HuggingFace
+``transformers`` model via ``from_pretrained()`` and returns it ready
+for inference. It encapsulates the device, processor/tokenizer, and
+inference-mode boilerplate that every HF-backed app needs to do
+identically:
+
+- detects an available CUDA device and falls back to CPU when none is
+  present
+- loads the caller-supplied ``processor_cls`` (defaults to
+  :class:`~transformers.AutoProcessor`; pass
+  :class:`~transformers.AutoTokenizer`,
+  :class:`~transformers.AutoImageProcessor`, etc. for narrower or
+  more specific cases)
+- loads the model via the caller-supplied ``model_cls``
+- moves the model to the resolved device and switches it to ``eval()``
+  mode
+- when ``padding_side`` is given (decoder-only / batched-generation
+  case), sets the tokenizer's padding side and, if no pad token is
+  defined, reuses the EOS token; the tokenizer is left untouched otherwise
+
+The function signature is::
+
+    load_hf_model(
+        model_id: str,
+        model_cls,                              # e.g. AutoModelForCausalLM, AutoModelForImageTextToText, ConvNextV2Model, ViTModel, ...
+        processor_cls = None,                   # None (default) loads AutoProcessor unless processor_kwargs is given, in which case processor loading is skipped; pass AutoTokenizer / AutoImageProcessor / ... for narrower cases
+        dtype = None,                           # None leaves the model's own default (typically float32); set explicitly (e.g., torch.bfloat16) for LLMs
+        device: Optional[str] = None,           # auto-detected when None
+        padding_side: Optional[str] = None,     # set to 'left' for decoder-only batched generation; leave None for encoder / non-batched cases
+        model_kwargs: Optional[dict] = None,    # extra kwargs forwarded to model_cls.from_pretrained()
+        processor_kwargs: Optional[dict] = None,  # extra kwargs forwarded to processor_cls.from_pretrained()
+    ) -> Tuple[processor, model, device_str]
+
+The ``model_kwargs`` and ``processor_kwargs`` pass-throughs cover the
+common ``from_pretrained()`` options that vary between model classes
+and use cases: ``use_safetensors``, ``use_fast``,
+``add_pooling_layer``, ``trust_remote_code``, ``revision``, etc.
+
+An app's ``__init__`` typically calls this helper once and stores the
+returned ``processor`` (or ``tokenizer`` / ``image_processor``),
+``model``, and ``device`` on ``self`` for use inside its inference
+method (e.g., :meth:`~clams.app.ClamsPromptableApp.generate`). See the
+function's docstring for the full parameter reference and return
+value.
+
+Promptable apps wrapping a decoder-only / chat-tuned model typically
+pass ``padding_side='left'`` and an explicit dtype like
+``torch.bfloat16``; encoder-side HF apps (e.g., a vision feature
+extractor + classifier head) leave both at the defaults and pass any
+class-specific kwargs through ``model_kwargs`` /
+``processor_kwargs``.
+
+Installation
+~~~~~~~~~~~~
+
+``torch`` and ``transformers`` are NOT included in the base
+``clams-python`` install (to keep the SDK lightweight for apps that
+don't need them). When your app uses the HF backend, install with the
+``hf`` extra::
+
+    pip install clams-python[hf]
+
+The helper module imports ``torch`` and ``transformers`` lazily, so a
+plain ``clams-python`` install can still import :mod:`clams.app` and
+:class:`~clams.app.ClamsPromptableApp` without those dependencies; the
+``ImportError`` only fires when an app actually calls
+:func:`clams.backends.hf.load_hf_model`.
diff --git a/pyproject.toml b/pyproject.toml
index 07f1e9c..44fd9de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,9 @@ source = "https://github.com/clamsproject/clams-python"
 dev = ["pytype", "pytest", "pytest-cov", "pillow", "setuptools"]
 docs = ["sphinx>=7.0,<8.0", "furo", "m2r2", "sphinx-jsonschema"]
 test = ["pytype", "pytest", "pytest-cov", "pillow"]
+# Required for apps using the HuggingFace transformers backend
+# (clams.backends.hf). Heavy deps; opt-in only.
+hf = ["torch", "transformers", "pillow"]
 
 [tool.setuptools.packages.find]
 where = ["."]

From ab2a10b8cb3cd95ab8ff03ec839f78c3fcbf69db Mon Sep 17 00:00:00 2001
From: Keigh Rim <keigh.rim@gmail.com>
Date: Thu, 28 May 2026 08:03:57 -0400
Subject: [PATCH 04/10] added HF backend handler, added test suites for LLM
 helpers

---
 clams/app/__init__.py             | 154 +++++++++++--
 clams/backends/__init__.py        |   9 +
 clams/backends/hf.py              | 121 ++++++++++
 documentation/app-baseclasses.rst |  15 +-
 tests/test_backends_hf.py         | 248 ++++++++++++++++++++
 tests/test_promptable.py          | 363 ++++++++++++++++++++++++++++++
 6 files changed, 888 insertions(+), 22 deletions(-)
 create mode 100644 clams/backends/__init__.py
 create mode 100644 clams/backends/hf.py
 create mode 100644 tests/test_backends_hf.py
 create mode 100644 tests/test_promptable.py

diff --git a/clams/app/__init__.py b/clams/app/__init__.py
index 94e01e4..027be13 100644
--- a/clams/app/__init__.py
+++ b/clams/app/__init__.py
@@ -816,30 +816,151 @@ def generate(
 
     def build_conversation(
             self,
-            prompt: Union[str, List[str]],
+            prompt: Union[str, List[str], List[dict]],
             system_prompt: str = '',
             images: Optional[List[Any]] = None,
             audio: Optional[List[Any]] = None,
             prompt_mode: str = 'turn-taking',
     ) -> Union[List[dict], List[List[dict]]]:
         """
-        Build a chat-template-compatible message list (or a list of
-        message lists for ``user-only`` mode).
+        Build a chat-template-compatible message list.
+
+        :param prompt: a plain string, a ``List[str]`` of prompt turns,
+            or a pre-built ``List[dict]`` of role/content message
+            objects (returned as-is — pass-through for advanced
+            callers that constructed the conversation themselves).
+        :param system_prompt: if non-empty, prepended as a
+            system-role message.
+        :param images: optional list of image inputs. Each appears as a
+            ``{'type': 'image', 'image': <input>}`` entry in the final
+            user turn's content (the first user turn in ``user-only`` mode).
+        :param audio: optional list of audio inputs. Each appears as a
+            ``{'type': 'audio', 'audio': <input>}`` entry in the final
+            user turn's content (the first user turn in ``user-only`` mode).
+        :param prompt_mode: ``"turn-taking"`` (default) or
+            ``"user-only"``. Only meaningful when ``prompt`` is a
+            multi-element list; ignored otherwise. See
+            :py:attr:`promptable_parameters` for semantics.
+
+        :returns:
+            * For single-shot prompts (string or single-element list)
+              and for multi-element ``turn-taking`` mode: a single
+              ``List[dict]`` of role/content messages, ready to feed
+              to a chat-template applier (e.g.,
+              ``processor.apply_chat_template``).
+            * For multi-element ``user-only`` mode: a
+              ``List[List[dict]]`` of N progressively-extending
+              conversation prefixes, one per user turn. Each prefix
+              ends in a user turn; assistant turns between users are
+              stored with ``content=None`` as placeholders for the
+              caller to fill in with successive generation results.
+
+        Subclasses may override to access model-specific state
+        (``self.processor``, ``self.tokenizer``, etc.) during
+        formatting; the base implementation is back-end-agnostic.
+        """
+        # Pass-through for pre-built message lists.
+        if isinstance(prompt, list) and prompt and all(
+                isinstance(p, dict) for p in prompt):
+            return prompt
+
+        # Normalize to List[str].
+        if isinstance(prompt, str):
+            prompts = [prompt]
+        else:
+            prompts = list(prompt)
+
+        if len(prompts) == 1:
+            return self._build_single_turn(
+                prompts[0], system_prompt, images, audio)
+
+        if prompt_mode == 'turn-taking':
+            return self._build_turn_taking(
+                prompts, system_prompt, images, audio)
+        if prompt_mode == 'user-only':
+            return self._build_user_only(
+                prompts, system_prompt, images, audio)
+        raise ValueError(
+            f"Unknown prompt_mode: {prompt_mode!r}. "
+            f"Expected 'turn-taking' or 'user-only'.")
 
-        Defined as an instance method so subclasses can override and
-        access model-specific state (``self.processor``,
-        ``self.tokenizer``, etc.) when formatting messages.
+    @staticmethod
+    def _make_user_content(text, images=None, audio=None):
+        """Build the content list for a user-role message."""
+        content = []
+        if images:
+            for img in images:
+                content.append({'type': 'image', 'image': img})
+        if audio:
+            for au in audio:
+                content.append({'type': 'audio', 'audio': au})
+        content.append({'type': 'text', 'text': text})
+        return content
+
+    def _build_single_turn(self, text, system_prompt, images, audio):
+        messages = []
+        if system_prompt:
+            messages.append({'role': 'system', 'content': system_prompt})
+        messages.append({
+            'role': 'user',
+            'content': self._make_user_content(text, images, audio),
+        })
+        return messages
+
+    def _build_turn_taking(self, prompts, system_prompt, images, audio):
+        """
+        Alternating user/assistant turns; one inference call.
+        Even indices in ``prompts`` are user turns, odd indices are
+        pre-written assistant exemplars. Images/audio (if any) are
+        attached to the final user turn (the actual query).
+        """
+        messages = []
+        if system_prompt:
+            messages.append({'role': 'system', 'content': system_prompt})
+        # index of the final user turn (the last even index)
+        last_user_idx = (len(prompts) - 1) - ((len(prompts) - 1) % 2)
+        for i, text in enumerate(prompts):
+            role = 'user' if i % 2 == 0 else 'assistant'
+            if role == 'user':
+                attach_media = (i == last_user_idx)
+                content = self._make_user_content(
+                    text,
+                    images if attach_media else None,
+                    audio if attach_media else None,
+                )
+                messages.append({'role': 'user', 'content': content})
+            else:
+                messages.append({'role': 'assistant', 'content': text})
+        return messages
 
-        .. note::
-           The base implementation is a stub in this release and raises
-           :py:class:`NotImplementedError`. Subclasses must override.
-           A default implementation will be added in a follow-up.
+    def _build_user_only(self, prompts, system_prompt, images, audio):
         """
-        raise NotImplementedError(
-            "ClamsPromptableApp.build_conversation() is a stub in this "
-            "release. Override it in your subclass, or wait for the base "
-            "implementation in a follow-up."
-        )
+        N progressively-extending conversation prefixes, one per user
+        turn. Assistant slots between users have ``content=None`` as
+        placeholders for the caller's successive generation results.
+        """
+        convs: List[List[dict]] = []
+        base: List[dict] = []
+        if system_prompt:
+            base.append({'role': 'system', 'content': system_prompt})
+        for i, text in enumerate(prompts):
+            # First user turn carries the images/audio (the initial query);
+            # subsequent user turns are text-only.
+            user_content = self._make_user_content(
+                text,
+                images if i == 0 else None,
+                audio if i == 0 else None,
+            )
+            base.append({'role': 'user', 'content': user_content})
+            # Snapshot the conversation as it stands at the start of
+            # the i-th generation call. Shallow-copy each message so
+            # later in-place edits (e.g., filling in the assistant
+            # placeholder) don't retroactively mutate earlier
+            # snapshots.
+            convs.append([dict(m) for m in base])
+            if i < len(prompts) - 1:
+                base.append({'role': 'assistant', 'content': None})
+        return convs
 
     def store_response(
             self,
@@ -872,7 +993,8 @@ def store_response(
         td = view.new_textdocument(text=answer)
         align = view.new_annotation(
             AnnotationTypes.Alignment,
-            properties={'source': source, 'target': td.long_id},
+            source=source,
+            target=td.id,
         )
         if trace is not None:
             raise NotImplementedError(
diff --git a/clams/backends/__init__.py b/clams/backends/__init__.py
new file mode 100644
index 0000000..d9fe452
--- /dev/null
+++ b/clams/backends/__init__.py
@@ -0,0 +1,9 @@
+"""
+Optional model-backend helpers for CLAMS apps.
+
+Each backend is a separate submodule. Heavy dependencies (e.g.,
+``torch``, ``transformers``) are NOT pulled in by the base
+``clams-python`` install; users opt in via pip extras such as
+``pip install clams-python[hf]`` for the HuggingFace transformers
+backend.
+"""
diff --git a/clams/backends/hf.py b/clams/backends/hf.py
new file mode 100644
index 0000000..ca0420e
--- /dev/null
+++ b/clams/backends/hf.py
@@ -0,0 +1,121 @@
+"""
+HuggingFace transformers backend helper.
+
+Provides :func:`load_hf_model`, a general loader that wraps the device,
+processor, dtype, and inference-mode boilerplate every HF-backed CLAMS
+app does identically. Usable for any model class that supports
+``from_pretrained()`` — instruction-tuned LLMs/VLMs, encoder-only
+classifiers, vision/audio feature extractors, etc.
+
+``torch`` and ``transformers`` are optional dependencies. Install them
+via the ``[hf]`` extra::
+
+    pip install clams-python[hf]
+
+Imports are lazy: this module can be referenced from
+:mod:`clams.app` without triggering an ``ImportError`` on a base
+``clams-python`` install. The :class:`ImportError` only fires when
+:func:`load_hf_model` is actually called without the extras.
+"""
+from typing import Any, Optional, Tuple
+
+
+def load_hf_model(
+        model_id: str,
+        model_cls,
+        processor_cls=None,
+        dtype=None,
+        device: Optional[str] = None,
+        padding_side: Optional[str] = None,
+        model_kwargs: Optional[dict] = None,
+        processor_kwargs: Optional[dict] = None,
+) -> Tuple[Any, Any, str]:
+    """
+    Load a HuggingFace ``transformers`` model via ``from_pretrained``
+    and return it ready for inference.
+
+    :param model_id: HuggingFace model identifier (e.g., a Hub repo
+        name or a local path) forwarded to ``from_pretrained``.
+    :param model_cls: a ``transformers`` model class (e.g.,
+        ``AutoModelForCausalLM``, ``AutoModelForImageTextToText``,
+        ``ConvNextV2Model``, ``ViTModel``, ...). Whatever supports
+        ``from_pretrained()``.
+    :param processor_cls: a processor / tokenizer / feature-extractor
+        class with ``from_pretrained()``, e.g. ``transformers.AutoTokenizer``
+        or ``transformers.AutoImageProcessor``. When ``None`` (default) and
+        ``processor_kwargs`` is also ``None``, ``transformers.AutoProcessor``
+        is used. When ``None`` but ``processor_kwargs`` is given, processor
+        loading is skipped and the returned ``processor`` is ``None``.
+    :param dtype: torch dtype for the model (e.g., ``torch.bfloat16``).
+        When ``None`` (default), no ``torch_dtype`` kwarg is forwarded
+        to ``from_pretrained`` -- the model class uses its own default
+        (typically float32). Set explicitly for low-precision LLM
+        inference.
+    :param device: target device string (e.g., ``'cuda'``, ``'cpu'``,
+        ``'cuda:0'``). When ``None`` (default), the helper auto-detects
+        cuda availability and falls back to cpu.
+    :param padding_side: if set (typically ``'left'`` for decoder-only
+        models doing batched generation), the helper configures the
+        underlying tokenizer's ``padding_side`` and -- when no pad
+        token is set -- uses the EOS token as the pad token. Leave
+        ``None`` for encoder / non-batched cases (the tokenizer's own
+        default is preserved).
+    :param model_kwargs: extra kwargs forwarded to
+        ``model_cls.from_pretrained()`` (e.g.,
+        ``{'use_safetensors': True, 'add_pooling_layer': False}``).
+    :param processor_kwargs: extra kwargs forwarded to
+        ``processor_cls.from_pretrained()`` (e.g.,
+        ``{'use_safetensors': True, 'use_fast': True}``).
+
+    :returns: ``(processor, model, device)`` tuple. ``processor`` is
+        the loaded processor/tokenizer/feature-extractor (or ``None``
+        when processor loading was skipped; see ``processor_cls``).
+        ``device`` is the resolved device string the model was moved
+        to.
+    :rtype: Tuple[Any, Any, str]
+    :raises ImportError: if ``torch`` or ``transformers`` is not
+        installed. Install the ``[hf]`` extra to fix.
+    """
+    try:
+        import torch  # pytype: disable=import-error
+    except ImportError as e:
+        raise ImportError(
+            "clams.backends.hf requires the `torch` package. "
+            "Install with: pip install clams-python[hf]"
+        ) from e
+    try:
+        import transformers  # pytype: disable=import-error
+    except ImportError as e:
+        raise ImportError(
+            "clams.backends.hf requires the `transformers` package. "
+            "Install with: pip install clams-python[hf]"
+        ) from e
+
+    resolved_device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
+
+    # Processor.
+    if processor_cls is None and processor_kwargs is None:
+        # default to AutoProcessor
+        processor_cls = transformers.AutoProcessor
+    if processor_cls is not None:
+        processor = processor_cls.from_pretrained(
+            model_id, **(processor_kwargs or {}))
+        if padding_side is not None:
+            tokenizer = getattr(processor, 'tokenizer', processor)
+            tokenizer.padding_side = padding_side
+            if getattr(tokenizer, 'pad_token', None) is None:
+                eos = getattr(tokenizer, 'eos_token', None)
+                if eos is not None:
+                    tokenizer.pad_token = eos
+    else:
+        processor = None
+
+    # Model.
+    model_load_kwargs = dict(model_kwargs or {})
+    if dtype is not None:
+        model_load_kwargs['torch_dtype'] = dtype
+    model = model_cls.from_pretrained(model_id, **model_load_kwargs)
+    model = model.to(resolved_device)
+    model.eval()
+
+    return processor, model, resolved_device
diff --git a/documentation/app-baseclasses.rst b/documentation/app-baseclasses.rst
index 488fa11..44d9ec3 100644
--- a/documentation/app-baseclasses.rst
+++ b/documentation/app-baseclasses.rst
@@ -327,15 +327,18 @@ Helpers
     ``metadata.py``) to add the SDK-managed promptable parameters.
 
 :meth:`~clams.app.ClamsPromptableApp.build_conversation`
-    Instance method that constructs a chat-template-compatible message list
-    (or list of message lists for ``user-only`` mode). Subclasses may
-    override to access model-specific state (e.g. ``self.processor``).
-    Currently a stub; a default implementation is planned for a follow-up
-    release.
+    Instance method that constructs a chat-template-compatible message
+    list (or a ``List[List[dict]]`` of progressively-extending prefixes
+    for ``user-only`` mode). Handles string and list prompt forms, the
+    two ``promptMode`` semantics, the optional ``systemPrompt``, and
+    inlines ``images`` / ``audio`` into the final user turn (the first
+    user turn in ``user-only`` mode). Accepts a pre-built ``List[dict]``
+    and returns it unchanged. Subclasses may override to access
+    model-specific state (e.g. ``self.processor``) when formatting messages.
 
 :meth:`~clams.app.ClamsPromptableApp.store_response`
     Helper for the common annotation-creation pattern: given a view, a
-    source annotation's ``long_id``, and a generated string, creates a
+    source annotation's identifier, and a generated string, creates a
     ``TextDocument`` containing the text plus an ``Alignment`` linking
     source to TextDocument; returns the ``(text_document, alignment)``
     pair. The optional ``trace`` parameter is reserved for
diff --git a/tests/test_backends_hf.py b/tests/test_backends_hf.py
new file mode 100644
index 0000000..d6ddb2e
--- /dev/null
+++ b/tests/test_backends_hf.py
@@ -0,0 +1,248 @@
+"""
+Tests for :func:`clams.backends.hf.load_hf_model`.
+
+Exercises the device / dtype / padding-side / kwargs-passthrough
+behavior of the helper against mocked ``transformers`` model and
+processor classes.
+
+If ``torch`` is not installed, the whole file is skipped (it is an
+optional dep behind the ``[hf]`` extra).
+"""
+import unittest
+
+import pytest
+
+pytest.importorskip('torch')
+
+from clams.backends.hf import load_hf_model  # noqa: E402
+
+
+# ---------------------------------------------------------------------------
+# Mocks
+# ---------------------------------------------------------------------------
+
+class _MockModel:
+    """Stand-in for a ``transformers`` model class."""
+
+    # cross-test state — each test should set this to None first
+    last_from_pretrained_args = None
+    last_from_pretrained_kwargs = None
+
+    @classmethod
+    def from_pretrained(cls, model_id, **kwargs):
+        cls.last_from_pretrained_args = (model_id,)
+        cls.last_from_pretrained_kwargs = dict(kwargs)
+        return cls()
+
+    def __init__(self):
+        self.device = None
+        self.eval_called = False
+
+    def to(self, device):
+        self.device = device
+        return self
+
+    def eval(self):
+        self.eval_called = True
+        return self
+
+
+class _MockTokenizer:
+    def __init__(self):
+        self.padding_side = 'right'
+        self.pad_token = None
+        self.eos_token = '<eos>'
+
+
+class _MockProcessor:
+    """Stand-in for ``AutoProcessor`` (or similar)."""
+
+    last_from_pretrained_args = None
+    last_from_pretrained_kwargs = None
+
+    @classmethod
+    def from_pretrained(cls, model_id, **kwargs):
+        cls.last_from_pretrained_args = (model_id,)
+        cls.last_from_pretrained_kwargs = dict(kwargs)
+        return cls()
+
+    def __init__(self):
+        self.tokenizer = _MockTokenizer()
+
+
+# ---------------------------------------------------------------------------
+# Test cases
+# ---------------------------------------------------------------------------
+
+class TestDefaultsOnly(unittest.TestCase):
+    """
+    Case (a): caller passes only ``model_id``, ``model_cls`` and a mock
+    ``processor_cls``. No dtype, no padding_side, no extra kwargs.
+    """
+
+    def setUp(self):
+        _MockModel.last_from_pretrained_args = None
+        _MockModel.last_from_pretrained_kwargs = None
+        _MockProcessor.last_from_pretrained_args = None
+        _MockProcessor.last_from_pretrained_kwargs = None
+
+    def test_returns_processor_model_device_tuple(self):
+        result = load_hf_model(
+            'fake-model-id', _MockModel, processor_cls=_MockProcessor)
+        self.assertEqual(len(result), 3)
+        processor, model, device = result
+        self.assertIsInstance(processor, _MockProcessor)
+        self.assertIsInstance(model, _MockModel)
+        self.assertIsInstance(device, str)
+        # cpu or cuda depending on host — must be one of them
+        self.assertIn(device, ('cpu', 'cuda'))
+
+    def test_no_torch_dtype_passed_when_dtype_is_none(self):
+        load_hf_model(
+            'fake-model-id', _MockModel, processor_cls=_MockProcessor)
+        # When dtype is None, helper should NOT inject torch_dtype into
+        # model_cls.from_pretrained (let the model class use its own
+        # default).
+        kwargs = _MockModel.last_from_pretrained_kwargs
+        self.assertNotIn('torch_dtype', kwargs)
+
+    def test_padding_side_untouched_when_not_requested(self):
+        processor, _, _ = load_hf_model(
+            'fake-model-id', _MockModel, processor_cls=_MockProcessor)
+        # Default 'right' should persist; helper should NOT have
+        # rewritten it.
+        self.assertEqual(processor.tokenizer.padding_side, 'right')
+        # pad_token should NOT have been forced to EOS.
+        self.assertIsNone(processor.tokenizer.pad_token)
+
+    def test_model_put_in_eval_mode(self):
+        _, model, _ = load_hf_model(
+            'fake-model-id', _MockModel, processor_cls=_MockProcessor)
+        self.assertTrue(model.eval_called)
+
+
+class TestDecoderOnlyMode(unittest.TestCase):
+    """
+    Case (b): caller passes ``padding_side='left'`` (decoder-only
+    batched generation) and an explicit ``dtype``.
+    """
+
+    def setUp(self):
+        _MockModel.last_from_pretrained_args = None
+        _MockModel.last_from_pretrained_kwargs = None
+        _MockProcessor.last_from_pretrained_args = None
+        _MockProcessor.last_from_pretrained_kwargs = None
+
+    def test_padding_side_set_to_left_on_tokenizer(self):
+        processor, _, _ = load_hf_model(
+            'fake-model-id', _MockModel,
+            processor_cls=_MockProcessor,
+            padding_side='left',
+        )
+        self.assertEqual(processor.tokenizer.padding_side, 'left')
+
+    def test_pad_token_set_from_eos_when_unset(self):
+        processor, _, _ = load_hf_model(
+            'fake-model-id', _MockModel,
+            processor_cls=_MockProcessor,
+            padding_side='left',
+        )
+        self.assertEqual(
+            processor.tokenizer.pad_token,
+            processor.tokenizer.eos_token,
+        )
+
+    def test_dtype_forwarded_as_torch_dtype(self):
+        import torch
+        load_hf_model(
+            'fake-model-id', _MockModel,
+            processor_cls=_MockProcessor,
+            dtype=torch.bfloat16,
+            padding_side='left',
+        )
+        self.assertEqual(
+            _MockModel.last_from_pretrained_kwargs.get('torch_dtype'),
+            torch.bfloat16,
+        )
+
+
+class TestKwargsPassThrough(unittest.TestCase):
+    """
+    Case (c): ``model_kwargs`` and ``processor_kwargs`` reach the
+    respective ``from_pretrained`` calls. Validates the SWT-style
+    pattern (use_safetensors, use_fast, add_pooling_layer, etc.).
+    """
+
+    def setUp(self):
+        _MockModel.last_from_pretrained_args = None
+        _MockModel.last_from_pretrained_kwargs = None
+        _MockProcessor.last_from_pretrained_args = None
+        _MockProcessor.last_from_pretrained_kwargs = None
+
+    def test_model_kwargs_reach_from_pretrained(self):
+        load_hf_model(
+            'fake-model-id', _MockModel,
+            processor_cls=_MockProcessor,
+            model_kwargs={'use_safetensors': True,
+                          'add_pooling_layer': False},
+        )
+        kw = _MockModel.last_from_pretrained_kwargs
+        self.assertTrue(kw.get('use_safetensors'))
+        self.assertFalse(kw.get('add_pooling_layer'))
+
+    def test_processor_kwargs_reach_from_pretrained(self):
+        load_hf_model(
+            'fake-model-id', _MockModel,
+            processor_cls=_MockProcessor,
+            processor_kwargs={'use_safetensors': True, 'use_fast': True},
+        )
+        kw = _MockProcessor.last_from_pretrained_kwargs
+        self.assertTrue(kw.get('use_safetensors'))
+        self.assertTrue(kw.get('use_fast'))
+
+    def test_model_id_arrives_first_positional(self):
+        load_hf_model(
+            'fake-model-id', _MockModel, processor_cls=_MockProcessor)
+        self.assertEqual(
+            _MockModel.last_from_pretrained_args, ('fake-model-id',))
+        self.assertEqual(
+            _MockProcessor.last_from_pretrained_args, ('fake-model-id',))
+
+    def test_model_and_processor_kwargs_do_not_cross_contaminate(self):
+        """SWT mixes incompatible kwargs across model and processor;
+        ensure helper doesn't blindly merge them."""
+        load_hf_model(
+            'fake-model-id', _MockModel,
+            processor_cls=_MockProcessor,
+            model_kwargs={'add_pooling_layer': False},
+            processor_kwargs={'use_fast': True},
+        )
+        # add_pooling_layer is model-only; should NOT reach processor
+        self.assertNotIn(
+            'add_pooling_layer',
+            _MockProcessor.last_from_pretrained_kwargs)
+        # use_fast is processor-only; should NOT reach model
+        self.assertNotIn(
+            'use_fast',
+            _MockModel.last_from_pretrained_kwargs)
+
+
+class TestDeviceResolution(unittest.TestCase):
+    """The helper auto-detects cuda/cpu when device is None."""
+
+    def setUp(self):
+        _MockModel.last_from_pretrained_args = None
+        _MockModel.last_from_pretrained_kwargs = None
+
+    def test_explicit_device_honored(self):
+        _, model, device = load_hf_model(
+            'fake-model-id', _MockModel,
+            processor_cls=_MockProcessor,
+            device='cpu',
+        )
+        self.assertEqual(device, 'cpu')
+        self.assertEqual(model.device, 'cpu')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_promptable.py b/tests/test_promptable.py
new file mode 100644
index 0000000..9311f21
--- /dev/null
+++ b/tests/test_promptable.py
@@ -0,0 +1,363 @@
+"""
+Tests for :class:`clams.app.ClamsPromptableApp`.
+
+Covers the behavior documented in
+``documentation/app-baseclasses.rst``: parameter discovery via
+``inject_promptable_parameters()``, the reservation rule on
+promptable-param names, ``build_conversation()`` shape across the
+single-turn / turn-taking / user-only modes, and the
+``response_to_grounded_textdocument()`` output contract.
+"""
+import unittest
+
+from mmif import AnnotationTypes, DocumentTypes, Mmif
+
+from clams import AppMetadata, ClamsPromptableApp
+
+
+# ---------------------------------------------------------------------------
+# Test infrastructure
+# ---------------------------------------------------------------------------
+
+def make_metadata(call_helper=True, pre_declare=None):
+    """
+    Build a fresh AppMetadata for tests.
+
+    :param call_helper: if True, calls
+        ``ClamsPromptableApp.inject_promptable_parameters(metadata)``
+        at the end (simulating a correctly-written ``appmetadata()``).
+    :param pre_declare: if set to a parameter spec dict, calls
+        ``metadata.add_parameter(**pre_declare)`` BEFORE the helper
+        runs — used to test reservation enforcement.
+    """
+    m = AppMetadata(
+        name="Example Promptable App",
+        description="Test fixture, creating input TD - output TD alignment",
+        app_license="MIT",
+        identifier="https://apps.clams.ai/example-promptable/v1",
+        url="https://fakegithub.com/some/repository",
+    )
+    m.add_input(DocumentTypes.TextDocument)
+    m.add_output(DocumentTypes.TextDocument)
+    m.add_output(AnnotationTypes.Alignment)
+    if pre_declare is not None:
+        m.add_parameter(**pre_declare)
+    if call_helper:
+        ClamsPromptableApp.inject_promptable_parameters(m)
+    return m
+
+
+def make_test_app(metadata):
+    """
+    Factory creating a fresh ClamsPromptableApp subclass that loads the
+    given metadata. Each call produces a fresh class so per-test state
+    doesn't leak.
+    """
+
+    def _load_appmetadata(self):
+        return metadata
+
+    cls = type(
+        'TestPromptableApp',
+        (ClamsPromptableApp,),
+        {
+            '_load_appmetadata': _load_appmetadata,
+            '_appmetadata': lambda self: None,
+            '_annotate': lambda self, mmif, **kw: mmif,
+            'generate': lambda self, prompt, **kw: [""],
+        },
+    )
+    return cls()
+
+
+# ---------------------------------------------------------------------------
+# Parameter discovery (via the helper)
+# ---------------------------------------------------------------------------
+
+class TestParameterDiscovery(unittest.TestCase):
+
+    def test_all_promptable_params_present_after_init(self):
+        app = make_test_app(make_metadata(call_helper=True))
+        present = {p.name for p in app.metadata.parameters}
+        expected_promptable = {p['name']
+                               for p in ClamsPromptableApp.promptable_parameters}
+        self.assertTrue(expected_promptable.issubset(present))
+
+    def test_prompt_has_no_sdk_default(self):
+        app = make_test_app(make_metadata(call_helper=True))
+        prompt_param = next(p for p in app.metadata.parameters
+                            if p.name == 'prompt')
+        self.assertIsNone(prompt_param.default)
+        self.assertTrue(prompt_param.multivalued)
+
+    def test_system_prompt_default_empty_string(self):
+        app = make_test_app(make_metadata(call_helper=True))
+        sysprompt = next(p for p in app.metadata.parameters
+                         if p.name == 'systemPrompt')
+        self.assertEqual(sysprompt.default, '')
+
+    def test_temperature_default_is_zero(self):
+        """When the caller omits ``temperature``, it should arrive in
+        ``_annotate()`` as the float ``0.0`` (deterministic decoding)."""
+        app = make_test_app(make_metadata(call_helper=True))
+        refined = app._refine_params(prompt=['hi'])
+        self.assertEqual(refined['temperature'], 0.0)
+        self.assertIsInstance(refined['temperature'], float)
+
+    def test_prompt_mode_choices(self):
+        app = make_test_app(make_metadata(call_helper=True))
+        pm = next(p for p in app.metadata.parameters
+                  if p.name == 'promptMode')
+        self.assertEqual(set(pm.choices), {'user-only', 'turn-taking'})
+        self.assertEqual(pm.default, 'turn-taking')
+
+
+# ---------------------------------------------------------------------------
+# Required-prompt validation
+# ---------------------------------------------------------------------------
+
+class TestRequiredPrompt(unittest.TestCase):
+
+    def test_refine_params_raises_when_prompt_missing(self):
+        """
+        ``prompt`` has no SDK default. ``_refine_params`` must raise
+        ``ValueError`` when the caller omits it.
+        """
+        app = make_test_app(make_metadata(call_helper=True))
+        with self.assertRaises(ValueError) as ctx:
+            app._refine_params()
+        self.assertIn('prompt', str(ctx.exception))
+
+
+# ---------------------------------------------------------------------------
+# Missing-helper validation in __init__
+# ---------------------------------------------------------------------------
+
+class TestMissingHelperValidation(unittest.TestCase):
+
+    def test_init_raises_when_helper_not_called(self):
+        """
+        If ``appmetadata()`` forgets to call
+        ``inject_promptable_parameters()``, ``__init__`` must raise
+        ``ValueError`` with an instructive message.
+        """
+        with self.assertRaises(ValueError) as ctx:
+            make_test_app(make_metadata(call_helper=False))
+        msg = str(ctx.exception)
+        self.assertIn('inject_promptable_parameters', msg)
+
+
+# ---------------------------------------------------------------------------
+# Reservation enforcement (via duplicate-name ValueError)
+# ---------------------------------------------------------------------------
+
+class TestReservationEnforcement(unittest.TestCase):
+
+    def test_redeclaring_prompt_trips_duplicate_name_error(self):
+        """
+        An app that calls ``metadata.add_parameter('prompt', ...)``
+        before the helper trips the existing duplicate-name
+        ``ValueError`` from ``AppMetadata.add_parameter`` (which the
+        helper's own ``add_parameter`` call raises).
+        """
+        with self.assertRaises(ValueError) as ctx:
+            make_metadata(
+                call_helper=True,
+                pre_declare={
+                    'name': 'prompt',
+                    'description': 'app-defined collision',
+                    'type': 'string',
+                    'multivalued': True,
+                },
+            )
+        self.assertIn("'prompt'", str(ctx.exception))
+
+    def test_redeclaring_max_new_tokens_trips_error(self):
+        with self.assertRaises(ValueError) as ctx:
+            make_metadata(
+                call_helper=True,
+                pre_declare={
+                    'name': 'maxNewTokens',
+                    'description': 'app-defined collision',
+                    'type': 'integer',
+                    'default': 1024,
+                },
+            )
+        self.assertIn("'maxNewTokens'", str(ctx.exception))
+
+
+# ---------------------------------------------------------------------------
+# annotate_param_caster covers promptable params (no stale-spec drift)
+# ---------------------------------------------------------------------------
+
+class TestAnnotateParamCaster(unittest.TestCase):
+
+    def test_caster_includes_promptable_param_specs(self):
+        app = make_test_app(make_metadata(call_helper=True))
+        for spec in ClamsPromptableApp.promptable_parameters:
+            self.assertIn(spec['name'], app.annotate_param_spec)
+            stored_type, stored_multivalued = \
+                app.annotate_param_spec[spec['name']]
+            self.assertEqual(stored_type, spec['type'])
+            self.assertEqual(
+                stored_multivalued, spec.get('multivalued', False))
+
+    def test_multivalued_prompt_casts_to_list_of_strings(self):
+        app = make_test_app(make_metadata(call_helper=True))
+        refined = app._refine_params(prompt=['hello', 'world'])
+        self.assertEqual(refined['prompt'], ['hello', 'world'])
+
+    def test_max_new_tokens_casts_to_int(self):
+        app = make_test_app(make_metadata(call_helper=True))
+        refined = app._refine_params(prompt=['hi'], maxNewTokens=['1024'])
+        self.assertEqual(refined['maxNewTokens'], 1024)
+        self.assertIsInstance(refined['maxNewTokens'], int)
+
+    def test_temperature_casts_to_float(self):
+        app = make_test_app(make_metadata(call_helper=True))
+        refined = app._refine_params(prompt=['hi'], temperature=['0.7'])
+        self.assertEqual(refined['temperature'], 0.7)
+        self.assertIsInstance(refined['temperature'], float)
+
+
+# ---------------------------------------------------------------------------
+# build_conversation
+# ---------------------------------------------------------------------------
+
+class TestBuildConversation(unittest.TestCase):
+    """
+    Covers the shape of ``ClamsPromptableApp.build_conversation()``
+    across single-turn, turn-taking, and user-only modes, and the
+    pre-built-message pass-through case.
+    """
+
+    def setUp(self):
+        self.app = make_test_app(make_metadata(call_helper=True))
+
+    def test_string_prompt_single_user_turn(self):
+        conv = self.app.build_conversation(prompt="hello")
+        self.assertEqual(len(conv), 1)
+        self.assertEqual(conv[0]['role'], 'user')
+
+    def test_single_element_list_single_user_turn(self):
+        conv = self.app.build_conversation(prompt=['hello'])
+        self.assertEqual(len(conv), 1)
+        self.assertEqual(conv[0]['role'], 'user')
+
+    def test_turn_taking_alternating_turns(self):
+        conv = self.app.build_conversation(
+            prompt=['q1', 'a1', 'q2'], prompt_mode='turn-taking')
+        self.assertEqual(len(conv), 3)
+        self.assertEqual(conv[0]['role'], 'user')
+        self.assertEqual(conv[1]['role'], 'assistant')
+        self.assertEqual(conv[2]['role'], 'user')
+
+    def test_user_only_returns_progressively_extending_lists(self):
+        convs = self.app.build_conversation(
+            prompt=['q1', 'q2', 'q3'], prompt_mode='user-only')
+        # N progressively-extending message lists, one per turn
+        self.assertEqual(len(convs), 3)
+        # last conversation has all 3 user turns (+ intermediate
+        # assistant turns once the model has filled them in; at
+        # build_conversation time the assistants are placeholders or
+        # empty — the test pins length, not exact content)
+        self.assertGreaterEqual(len(convs[-1]), 3)
+
+    def test_pre_built_list_pass_through(self):
+        msgs = [
+            {'role': 'system', 'content': 'You are helpful.'},
+            {'role': 'user', 'content': 'hi'},
+        ]
+        conv = self.app.build_conversation(prompt=msgs)
+        self.assertEqual(conv, msgs)
+
+    def test_system_prompt_prepended(self):
+        conv = self.app.build_conversation(
+            prompt='hello', system_prompt='You are helpful.')
+        # first turn is a system message
+        self.assertEqual(conv[0]['role'], 'system')
+
+    def test_images_carried_in_user_content(self):
+        sentinel = object()
+        conv = self.app.build_conversation(
+            prompt='describe this', images=[sentinel])
+        # the sentinel image should appear somewhere in the first
+        # user-turn content
+        user_turn = next(m for m in conv if m['role'] == 'user')
+        # content is typically a list of dicts; flatten to a sequence
+        # of values and check for the sentinel
+        flat = []
+
+        def _walk(x):
+            if isinstance(x, dict):
+                for v in x.values():
+                    _walk(v)
+            elif isinstance(x, list):
+                for v in x:
+                    _walk(v)
+            else:
+                flat.append(x)
+
+        _walk(user_turn['content'])
+        self.assertIn(sentinel, flat)
+
+
+# ---------------------------------------------------------------------------
+# store_response
+# ---------------------------------------------------------------------------
+
+class TestStoreResponse(unittest.TestCase):
+
+    def setUp(self):
+        self.app = make_test_app(make_metadata(call_helper=True))
+        self.mmif = Mmif(validate=False)
+        self.view = self.mmif.new_view()
+        self.app.sign_view(self.view, {})
+        self.view.new_contain(DocumentTypes.TextDocument)
+        self.view.new_contain(AnnotationTypes.Alignment)
+
+    def test_happy_path_creates_textdocument_and_alignment(self):
+        td, align = self.app.store_response(
+            self.view, source='src1', answer='generated text')
+        self.assertEqual(td.text_value, 'generated text')
+        self.assertEqual(align.get_property('source'), 'src1')
+        self.assertEqual(align.get_property('target'), td.id)
+
+    def test_trace_none_does_not_raise(self):
+        # no exception
+        self.app.store_response(
+            self.view, source='src1', answer='text', trace=None)
+
+    def test_trace_not_none_raises_not_implemented(self):
+        with self.assertRaises(NotImplementedError):
+            self.app.store_response(
+                self.view, source='src1', answer='text',
+                trace='intermediate reasoning')
+
+
+# ---------------------------------------------------------------------------
+# Transport-neutral parameter casting
+# ---------------------------------------------------------------------------
+
+class TestTransportNeutralCasting(unittest.TestCase):
+    """
+    Just exercises the standard ``ClamsApp`` parameter-casting path.
+    Not envelope-specific; the point is that promptable apps see no
+    separate transport layer.
+    """
+
+    def test_multi_element_prompt_arrives_as_list_of_strings(self):
+        app = make_test_app(make_metadata(call_helper=True))
+        refined = app._refine_params(prompt=['a', 'b', 'c'])
+        self.assertEqual(refined['prompt'], ['a', 'b', 'c'])
+        for x in refined['prompt']:
+            self.assertIsInstance(x, str)
+
+    def test_single_element_prompt_still_list(self):
+        app = make_test_app(make_metadata(call_helper=True))
+        refined = app._refine_params(prompt=['only'])
+        self.assertEqual(refined['prompt'], ['only'])
+
+
+if __name__ == '__main__':
+    unittest.main()

From 2267d4367a298ac1b73288922f23675724b0f58b Mon Sep 17 00:00:00 2001
From: Keigh Rim <keigh.rim@gmail.com>
Date: Thu, 28 May 2026 08:07:23 -0400
Subject: [PATCH 05/10] small clarification in naming and docstring

---
 clams/app/__init__.py             | 115 ++++++++++++++++++++++--------
 documentation/app-baseclasses.rst |  35 +++++----
 documentation/runtime-params.rst  |   2 +-
 tests/test_promptable.py          |  47 +++++++++---
 4 files changed, 143 insertions(+), 56 deletions(-)

diff --git a/clams/app/__init__.py b/clams/app/__init__.py
index 027be13..b004eea 100644
--- a/clams/app/__init__.py
+++ b/clams/app/__init__.py
@@ -11,7 +11,7 @@
 
 __all__ = ['ClamsApp', 'ClamsPromptableApp']
 
-from typing import Union, Any, Optional, Dict, List, Tuple
+from typing import Union, Any, Optional, Dict, List, Tuple, cast
 
 from mmif import Mmif, Document, DocumentTypes, View, AnnotationTypes
 from mmif.utils.video_document_helper import (
@@ -116,7 +116,7 @@ def appmetadata(self, **kwargs: List[str]) -> str:
         """
         # cast only, no refinement
         casted = self.metadata_param_caster.cast(kwargs)
-        pretty = casted.pop('pretty') if 'pretty' in casted else False
+        pretty = casted.get('pretty', False)
         return self.metadata.jsonify(pretty)
     
     def _load_appmetadata(self) -> AppMetadata:
@@ -185,7 +185,7 @@ def annotate(self, mmif: Union[str, dict, Mmif], **runtime_params: List[str]) ->
         refined = self._refine_params(**runtime_params)
         self.logger.debug(f"Refined parameters: {refined}")
         pretty = refined.get('pretty', False)
-        sampling_mode_str = refined.pop('tfSamplingMode', None)
+        sampling_mode_str = refined.get('tfSamplingMode', None)
         if sampling_mode_str is not None:
             _sampling_mode.set(SamplingMode(sampling_mode_str))
         t = datetime.now()
@@ -639,6 +639,11 @@ def open_document_location(document: Union[str, Document], opener: Any = open, *
                     raise FileNotFoundError(p.path)
 
 
+# TODO (krim @ 05/28/26): consider generating this documentation
+# with autodoc (e.g., ``automethod`` for methods plus a small
+# Sphinx extension that renders ``promptable_parameters`` into the
+# parameter table) instead of maintaining the hand-authored
+# ``documentation/app-baseclasses.rst``.
 class ClamsPromptableApp(ClamsApp):
     """
     Base class for CLAMS apps that wrap a promptable model (an LLM or
@@ -664,8 +669,9 @@ class ClamsPromptableApp(ClamsApp):
       parameter set to ``AppMetadata``
     * :py:meth:`build_conversation` — assemble a chat-template-compatible
       message list (stub in this release)
-    * :py:meth:`store_response` — persist a generated response into a
-      view as ``TextDocument`` + ``Alignment``
+    * :py:meth:`response_to_grounded_textdocument` — persist a
+      generated response into a view as ``TextDocument`` +
+      ``Alignment`` (+ optional ``origins`` / ``origination``)
     """
 
     #: SDK-managed runtime parameters injected into every promptable app.
@@ -724,12 +730,22 @@ class ClamsPromptableApp(ClamsApp):
                 'is greater than 0.',
         },
         {
-            'name': 'batchSize', 'type': 'integer', 'default': 1,
+            'name': 'parallelPrompts', 'type': 'integer', 'default': 1,
             'description':
-                'How many input items the app groups per ``generate`` call. '
-                'GPU memory scales roughly linearly with batch size; raise '
-                'for throughput on GPUs with headroom, keep at ``1`` on '
-                'memory-tight setups.',
+                'Number of independent prompts the app runs in parallel '
+                '(stacks into a single forward pass). The *size* of each '
+                'prompt (how many images, how long the system/user text '
+                'is, etc.) is NOT regulated by this parameter; that is '
+                'each app\'s responsibility. Prompt count and per-prompt '
+                'content size combine multiplicatively for GPU memory, '
+                'so the two can blow up together. Catastrophic example: '
+                '``tfSamplingMode=all`` on a TimeFrame without '
+                '``targets`` expands that TF into one image per '
+                'native-FPS frame (300 images for a 10-second TF at '
+                '30fps); ``parallelPrompts=4`` then runs 4 such prompts '
+                'in one forward pass (~1200 images), guaranteed OOM. '
+                'Keep at ``1`` on memory-tight setups; raise only when '
+                'per-prompt content is small and bounded.',
         },
     ]
 
@@ -782,7 +798,7 @@ def generate(
             images: Optional[List[Any]] = None,
             audio: Optional[List[Any]] = None,
             prompt_mode: str = 'turn-taking',
-            batch_size: int = 1,
+            parallel_prompts: int = 1,
             **generation_params,
     ) -> List[str]:
         """
@@ -804,8 +820,8 @@ def generate(
         :param audio: optional list of input audio clips
         :param prompt_mode: ``"turn-taking"`` (default) or ``"user-only"``;
             see :py:attr:`promptable_parameters`
-        :param batch_size: max number of items per underlying
-            ``generate`` call
+        :param parallel_prompts: max number of independent prompts the
+            underlying call stacks into one forward pass
         :param generation_params: any additional backend-specific
             generation kwargs (``maxNewTokens``, ``temperature``,
             ``topP``, ``topK``, etc.)
@@ -862,7 +878,7 @@ def build_conversation(
         # Pass-through for pre-built message lists.
         if isinstance(prompt, list) and prompt and all(
                 isinstance(p, dict) for p in prompt):
-            return prompt
+            return cast(List[dict], prompt)
 
         # Normalize to List[str].
         if isinstance(prompt, str):
@@ -962,41 +978,78 @@ def _build_user_only(self, prompts, system_prompt, images, audio):
                 base.append({'role': 'assistant', 'content': None})
         return convs
 
-    def store_response(
+    def response_to_grounded_textdocument(
             self,
             view: View,
             source: str,
-            answer: str,
-            trace: Optional[str] = None,
+            response: str,
+            origins: Optional[List[str]] = None,
+            origination: Optional[str] = None,
+            reasoning_trace: Optional[str] = None,
     ) -> Tuple[Any, Any]:
         """
-        Persist a generated response into a view as a
-        ``TextDocument`` (containing ``answer``) plus an
-        ``Alignment`` linking ``source`` to the new TextDocument.
+        Persist a single LLM text response into a view. Writes one
+        ``TextDocument`` (containing the response) plus an
+        ``Alignment`` (``source -> td``) that grounds it, and, when
+        given, ``origins`` / ``origination`` properties on the TD.
+
+        The two grounding link kinds are semantically distinct:
+
+        * ``source`` is the *coarse* cross-modal grounding -- the
+          single annotation id that the response is anchored to.
+          Written into the new ``Alignment`` (``source -> td``).
+          Typical value: the parent ``TimeFrame`` for a
+          captioning/OCR app.
+        * ``origins`` are the *finer* derivation grounding -- a list
+          of annotation ids the response was specifically derived
+          from (e.g. the ``TimePoint``\\s whose frames were fed to
+          the model). Written into ``TextDocument.origins``. See
+          https://clams.ai/clams-vocabulary/Document for vocabulary
+          semantics.
 
         :param view: the :class:`View` to write into. The caller is
             responsible for having called
             :meth:`View.new_contain` for ``TextDocument`` and
             ``Alignment`` first if needed.
-        :param source: ``long_id`` of the source annotation that
-            produced the response (e.g. a ``TimePoint`` or
-            ``ImageDocument``)
-        :param answer: the text generated by the model
-        :param trace: optional reasoning trace. NOT YET SUPPORTED —
-            passing a non-``None`` value raises
-            :py:class:`NotImplementedError`. Storage convention is
-            still being decided at
+        :param source: ``id`` of the annotation to record as the
+            cross-modal anchor of the response (see above).
+        :param response: the text generated by the model.
+        :param origins: optional list of ``id``\\s of annotations the
+            response was *derived* from. Must be paired with
+            ``origination``.
+        :param origination: nature of the derivation, written to
+            ``TextDocument.origination``. Accepted values per the
+            vocabulary include ``'derived'``, ``'transcription'``,
+            ``'topologically-identical'``. Must be paired with
+            ``origins``.
+        :param reasoning_trace: optional model-side reasoning trace
+            (a chain-of-thought / scratchpad string, NOT a Python
+            traceback). NOT YET SUPPORTED -- passing a non-``None``
+            value raises :py:class:`NotImplementedError`. Storage
+            convention is still being decided at
             clamsproject/clams-python#263.
         :return: ``(TextDocument, Alignment)`` tuple of the new
-            annotations
+            annotations.
+        :raises ValueError: if only one of ``origins`` /
+            ``origination`` is given (an empty value counts as omitted).
+        :raises NotImplementedError: if ``reasoning_trace`` is set.
         """
-        td = view.new_textdocument(text=answer)
+        if bool(origins) != bool(origination):
+            raise ValueError(
+                "`origins` and `origination` must be supplied together "
+                "or both omitted; got "
+                f"origins={origins!r}, origination={origination!r}."
+            )
+        td = view.new_textdocument(text=response)
+        if origins:
+            td.add_property('origins', origins)
+            td.add_property('origination', origination)
         align = view.new_annotation(
             AnnotationTypes.Alignment,
             source=source,
             target=td.id,
         )
-        if trace is not None:
+        if reasoning_trace is not None:
             raise NotImplementedError(
                 "Reasoning-trace storage convention is not yet defined; "
                 "tracked at clamsproject/clams-python#263."
diff --git a/documentation/app-baseclasses.rst b/documentation/app-baseclasses.rst
index 44d9ec3..afdfbbe 100644
--- a/documentation/app-baseclasses.rst
+++ b/documentation/app-baseclasses.rst
@@ -196,13 +196,22 @@ from :class:`~clams.app.ClamsApp`. These names are reserved; see
      - ``50``
      - no
      - Top-K sampling cutoff. Only meaningful when ``temperature`` > 0.
-   * - ``batchSize``
+   * - ``parallelPrompts``
      - integer
      - ``1``
      - no
-     - How many input items the app groups per ``generate`` call. GPU memory
-       scales roughly linearly with batch size; raise for throughput on
-       GPUs with headroom, keep at ``1`` on memory-tight setups.
+     - Number of independent prompts the app runs in parallel (stacks
+       into a single forward pass). The *size* of each prompt (how many
+       images, how long the system/user text is, etc.) is NOT regulated
+       by this parameter; that is each app's responsibility. Prompt
+       count and per-prompt content size combine multiplicatively for
+       GPU memory, so the two can blow up together. Catastrophic
+       example: ``tfSamplingMode=all`` on a TimeFrame without
+       ``targets`` expands that TF into one image per native-FPS frame
+       (300 images for a 10-second TF at 30fps); ``parallelPrompts=4``
+       then runs 4 such prompts in one forward pass (~1200 images),
+       guaranteed OOM. Keep at ``1`` on memory-tight setups; raise only
+       when per-prompt content is small and bounded.
 
 .. _promptable-customizing-defaults:
 
@@ -214,7 +223,7 @@ deliberately leaves ``prompt`` **without** a default; prompts are
 inherently app-specific and no single value is right for all apps.
 Beyond ``prompt``, other defaults may also be inappropriate for a given
 app: a model that needs longer outputs wants a higher ``maxNewTokens``,
-a small-VRAM deployment wants ``batchSize`` pinned at ``1``, etc.
+a small-VRAM deployment wants ``parallelPrompts`` pinned at ``1``, etc.
 
 Because the reservation rule prevents calling
 ``metadata.add_parameter('prompt', ...)`` (or any other promptable name)
@@ -336,15 +345,13 @@ Helpers
     may override to access model-specific state (e.g.
     ``self.processor``) when formatting messages.
 
-:meth:`~clams.app.ClamsPromptableApp.store_response`
-    Helper for the common annotation-creation pattern: given a view, a
-    source annotation's identifier, and a generated string, creates a
-    ``TextDocument`` containing the text plus an ``Alignment`` linking
-    source to TextDocument; returns the ``(text_document, alignment)``
-    pair. The optional ``trace`` parameter is reserved for
-    reasoning-trace storage; passing a non-``None`` value currently
-    raises :class:`NotImplementedError` (storage convention tracked in
-    clamsproject/clams-python#263).
+:meth:`~clams.app.ClamsPromptableApp.response_to_grounded_textdocument`
+    Writes a ``TextDocument`` plus an ``Alignment`` (``source -> TD``)
+    into a view and returns the ``(text_document, alignment)`` pair.
+    ``source`` is the coarse cross-modal anchor; the optional
+    ``origins`` (paired with ``origination``) is the finer derivation
+    list, written to the TD's ``origins`` / ``origination`` properties.
+    See https://clams.ai/clams-vocabulary/Document for vocabulary semantics.
 
 Backend helpers
 ^^^^^^^^^^^^^^^
diff --git a/documentation/runtime-params.rst b/documentation/runtime-params.rst
index 47aa8a5..cb8d65a 100644
--- a/documentation/runtime-params.rst
+++ b/documentation/runtime-params.rst
@@ -201,7 +201,7 @@ from :class:`~clams.app.ClamsPromptableApp` instead of
 :class:`~clams.app.ClamsApp`. The promptable base class adds a standardized,
 SDK-managed set of runtime parameters (``prompt``, ``systemPrompt``,
 ``temperature``, ``maxNewTokens``, ``topP``, ``topK``, ``promptMode``,
-``batchSize``) on top of the universal parameters. If you use this base
+``parallelPrompts``) on top of the universal parameters. If you use this base
 class, these names are reserved — your app's ``metadata.py`` must not
 redeclare them — and are added via a single helper call inside
 ``appmetadata()``.
diff --git a/tests/test_promptable.py b/tests/test_promptable.py
index 9311f21..4614ca3 100644
--- a/tests/test_promptable.py
+++ b/tests/test_promptable.py
@@ -303,7 +303,7 @@ def _walk(x):
 
 
 # ---------------------------------------------------------------------------
-# store_response
+# response_to_grounded_textdocument
 # ---------------------------------------------------------------------------
 
 class TestStoreResponse(unittest.TestCase):
@@ -317,22 +317,49 @@ def setUp(self):
         self.view.new_contain(AnnotationTypes.Alignment)
 
     def test_happy_path_creates_textdocument_and_alignment(self):
-        td, align = self.app.store_response(
-            self.view, source='src1', answer='generated text')
+        td, align = self.app.response_to_grounded_textdocument(
+            self.view, source='src1', response='generated text')
         self.assertEqual(td.text_value, 'generated text')
         self.assertEqual(align.get_property('source'), 'src1')
         self.assertEqual(align.get_property('target'), td.id)
 
-    def test_trace_none_does_not_raise(self):
+    def test_reasoning_trace_none_does_not_raise(self):
         # no exception
-        self.app.store_response(
-            self.view, source='src1', answer='text', trace=None)
+        self.app.response_to_grounded_textdocument(
+            self.view, source='src1', response='text',
+            reasoning_trace=None)
 
-    def test_trace_not_none_raises_not_implemented(self):
+    def test_reasoning_trace_not_none_raises_not_implemented(self):
         with self.assertRaises(NotImplementedError):
-            self.app.store_response(
-                self.view, source='src1', answer='text',
-                trace='intermediate reasoning')
+            self.app.response_to_grounded_textdocument(
+                self.view, source='src1', response='text',
+                reasoning_trace='intermediate reasoning')
+
+    # TODO (krim @ 05/28/26): this test case belongs upstream in the
+    # vocabulary type definition (the `origins`/`origination` pairing
+    # is a property of the `Document` type, per clams-vocabulary#18,
+    # not a behavior of the SDK app layer). Move once clams-vocabulary
+    # supports conditional prop validation. For now, this is a sanity
+    # check that the SDK correctly forwards both kwargs through to the
+    # underlying TD.
+    def test_origins_and_origination_written_together(self):
+        td, align = self.app.response_to_grounded_textdocument(
+            self.view, source='tf1', response='caption text',
+            origins=['tp1'], origination='derived')
+        self.assertEqual(td.get_property('origins'), ['tp1'])
+        self.assertEqual(td.get_property('origination'), 'derived')
+        self.assertEqual(align.get_property('source'), 'tf1')
+        self.assertEqual(align.get_property('target'), td.id)
+
+    def test_unpaired_origins_or_origination_raises(self):
+        unpaired = [
+            {'origins': ['tp1']},
+            {'origination': 'derived'},
+        ]
+        for kwargs in unpaired:
+            with self.subTest(**kwargs), self.assertRaises(ValueError):
+                self.app.response_to_grounded_textdocument(
+                    self.view, source='src1', response='text', **kwargs)
 
 
 # ---------------------------------------------------------------------------

From 5364a36ec705f6482d0c3aaad86aac133ee3739f Mon Sep 17 00:00:00 2001
From: Keigh Rim <keigh.rim@gmail.com>
Date: Thu, 28 May 2026 16:00:29 -0400
Subject: [PATCH 06/10] added HF-specific "promptable" subclass

---
 clams/app/__init__.py                       | 289 +++++++++++++++++---
 clams/develop/templates/app/app.py.template |  45 ++-
 documentation/app-baseclasses.rst           | 143 +++++++++-
 pyproject.toml                              |   2 +-
 tests/test_promptable.py                    |  74 +++++
 5 files changed, 499 insertions(+), 54 deletions(-)

diff --git a/clams/app/__init__.py b/clams/app/__init__.py
index b004eea..b75233e 100644
--- a/clams/app/__init__.py
+++ b/clams/app/__init__.py
@@ -9,7 +9,7 @@
 from datetime import datetime
 from urllib import parse as urlparser
 
-__all__ = ['ClamsApp', 'ClamsPromptableApp']
+__all__ = ['ClamsApp', 'ClamsPromptableApp', 'ClamsHFPromptableApp']
 
 from typing import Union, Any, Optional, Dict, List, Tuple, cast
 
@@ -795,37 +795,51 @@ def generate(
             self,
             prompt: List[str],
             system_prompt: str = '',
-            images: Optional[List[Any]] = None,
-            audio: Optional[List[Any]] = None,
+            images: Optional[List[List[Any]]] = None,
+            audios: Optional[List[List[Any]]] = None,
             prompt_mode: str = 'turn-taking',
-            parallel_prompts: int = 1,
             **generation_params,
     ) -> List[str]:
         """
-        Run inference on the given prompt against the given inputs.
-        Subclasses MUST implement this.
-
-        The return value is a flat list of strings: one entry per input
-        item (one per image when ``images`` is given, one per audio clip
-        when ``audio`` is given, or a singleton for text-only
-        single-shot generation).
-
-        :param prompt: a ``List[str]`` of prompt turns. A single-element
-            list is one-shot. A multi-element list is multi-turn and is
-            assembled according to ``prompt_mode``.
-        :param system_prompt: optional system-role text prepended to the
-            conversation
-        :param images: optional list of input images to broadcast across
-            the prompt (one generation per image)
-        :param audio: optional list of input audio clips
-        :param prompt_mode: ``"turn-taking"`` (default) or ``"user-only"``;
-            see :py:attr:`promptable_parameters`
-        :param parallel_prompts: max number of independent prompts the
-            underlying call stacks into one forward pass
+        Run N independent prompts in one inference call and return N
+        outputs. Subclasses MUST implement this.
+
+        Each inner list of ``images`` / ``audios`` is the bundled
+        multimodal content for ONE prompt -- the model sees those
+        items as one composite input and produces one output. The
+        outer list spans N prompts processed in parallel (when the
+        backend supports it; sequentially otherwise).
+
+        * Single-prompt call: ``images=[[img1, img2]]`` -> one output
+          (composite over the two bundled images).
+        * Per-input broadcast: ``images=[[img1], [img2], [img3]]`` ->
+          three outputs (one per image). Caller assembles the
+          singleton-wrap shape.
+        * Multimodal pair: ``images=[[img1]], audios=[[au1]]`` -> one
+          output. When both ``images`` and ``audios`` are given they
+          must have the same outer length; index ``i`` of each pairs
+          into prompt ``i``.
+
+        :param prompt: a ``List[str]`` of prompt turns. A
+            single-element list is one-shot. A multi-element list is
+            multi-turn and is assembled according to ``prompt_mode``.
+        :param system_prompt: optional system-role text prepended to
+            the conversation. Applies to every prompt in the batch.
+        :param images: optional ``List[List[Any]]`` -- N groups, one
+            per prompt; each inner list is the bundled images for that
+            prompt.
+        :param audios: optional ``List[List[Any]]`` -- N groups, one
+            per prompt; each inner list is the bundled audio clips
+            for that prompt.
+        :param prompt_mode: ``"turn-taking"`` (default) or
+            ``"user-only"``; see :py:attr:`promptable_parameters`.
         :param generation_params: any additional backend-specific
             generation kwargs (``maxNewTokens``, ``temperature``,
-            ``topP``, ``topK``, etc.)
-        :return: one generated string per input item
+            ``topP``, ``topK``, etc.).
+        :return: a ``List[str]`` with one entry per prompt in the
+            batch. For ``prompt_mode='user-only'`` multi-turn, each
+            prompt's entry is the assistant's final reply across its
+            N user turns.
         :rtype: List[str]
         """
         raise NotImplementedError
@@ -835,7 +849,7 @@ def build_conversation(
             prompt: Union[str, List[str], List[dict]],
             system_prompt: str = '',
             images: Optional[List[Any]] = None,
-            audio: Optional[List[Any]] = None,
+            audios: Optional[List[Any]] = None,
             prompt_mode: str = 'turn-taking',
     ) -> Union[List[dict], List[List[dict]]]:
         """
@@ -850,7 +864,7 @@ def build_conversation(
         :param images: optional list of image inputs to include in the
             (final) user turn's content. Each appears as a
             ``{'type': 'image', 'image': <input>}`` entry.
-        :param audio: optional list of audio inputs to include in the
+        :param audios: optional list of audio inputs to include in the
             (final) user turn's content. Each appears as a
             ``{'type': 'audio', 'audio': <input>}`` entry.
         :param prompt_mode: ``"turn-taking"`` (default) or
@@ -888,46 +902,46 @@ def build_conversation(
 
         if len(prompts) == 1:
             return self._build_single_turn(
-                prompts[0], system_prompt, images, audio)
+                prompts[0], system_prompt, images, audios)
 
         if prompt_mode == 'turn-taking':
             return self._build_turn_taking(
-                prompts, system_prompt, images, audio)
+                prompts, system_prompt, images, audios)
         if prompt_mode == 'user-only':
             return self._build_user_only(
-                prompts, system_prompt, images, audio)
+                prompts, system_prompt, images, audios)
         raise ValueError(
             f"Unknown prompt_mode: {prompt_mode!r}. "
             f"Expected 'turn-taking' or 'user-only'.")
 
     @staticmethod
-    def _make_user_content(text, images=None, audio=None):
+    def _make_user_content(text, images=None, audios=None):
         """Build the content list for a user-role message."""
         content = []
         if images:
             for img in images:
                 content.append({'type': 'image', 'image': img})
-        if audio:
-            for au in audio:
+        if audios:
+            for au in audios:
                 content.append({'type': 'audio', 'audio': au})
         content.append({'type': 'text', 'text': text})
         return content
 
-    def _build_single_turn(self, text, system_prompt, images, audio):
+    def _build_single_turn(self, text, system_prompt, images, audios):
         messages = []
         if system_prompt:
             messages.append({'role': 'system', 'content': system_prompt})
         messages.append({
             'role': 'user',
-            'content': self._make_user_content(text, images, audio),
+            'content': self._make_user_content(text, images, audios),
         })
         return messages
 
-    def _build_turn_taking(self, prompts, system_prompt, images, audio):
+    def _build_turn_taking(self, prompts, system_prompt, images, audios):
         """
         Alternating user/assistant turns; one inference call.
         Even indices in ``prompts`` are user turns, odd indices are
-        pre-written assistant exemplars. Images/audio (if any) are
+        pre-written assistant exemplars. Images/audios (if any) are
         attached to the final user turn (the actual query).
         """
         messages = []
@@ -942,14 +956,14 @@ def _build_turn_taking(self, prompts, system_prompt, images, audio):
                 content = self._make_user_content(
                     text,
                     images if attach_media else None,
-                    audio if attach_media else None,
+                    audios if attach_media else None,
                 )
                 messages.append({'role': 'user', 'content': content})
             else:
                 messages.append({'role': 'assistant', 'content': text})
         return messages
 
-    def _build_user_only(self, prompts, system_prompt, images, audio):
+    def _build_user_only(self, prompts, system_prompt, images, audios):
         """
         N progressively-extending conversation prefixes, one per user
         turn. Assistant slots between users have ``content=None`` as
@@ -960,12 +974,12 @@ def _build_user_only(self, prompts, system_prompt, images, audio):
         if system_prompt:
             base.append({'role': 'system', 'content': system_prompt})
         for i, text in enumerate(prompts):
-            # First user turn carries the images/audio (the initial query);
+            # First user turn carries the images/audios (the initial query);
             # subsequent user turns are text-only.
             user_content = self._make_user_content(
                 text,
                 images if i == 0 else None,
-                audio if i == 0 else None,
+                audios if i == 0 else None,
             )
             base.append({'role': 'user', 'content': user_content})
             # Snapshot the conversation as it stands at the start of
@@ -1057,6 +1071,197 @@ def response_to_grounded_textdocument(
         return td, align
 
 
+class ClamsHFPromptableApp(ClamsPromptableApp):
+    """
+    Base class for promptable CLAMS apps backed by a local
+    HuggingFace ``transformers`` model. Layers HF-specific inference
+    plumbing on top of :class:`ClamsPromptableApp`: model loading
+    via :func:`clams.backends.hf.load_hf_model`, and a concrete
+    :py:meth:`generate` implementation that runs N independent
+    prompts in one HF forward pass via the standard
+    chat-template -> ``model.generate`` -> ``batch_decode`` pipeline.
+
+    Concrete subclasses declare the model via class attributes
+    (:py:attr:`MODEL_ID`, :py:attr:`MODEL_CLS`, etc.) and typically
+    only need to implement :py:meth:`_annotate` -- the per-app MMIF
+    I/O. Example::
+
+        class MyVLMCaptioner(ClamsHFPromptableApp):
+            MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+            MODEL_CLS = AutoModelForImageTextToText
+            DTYPE = torch.bfloat16
+            PADDING_SIDE = 'left'
+
+            def _annotate(self, mmif, **parameters):
+                # collect tasks from MMIF, build image groups, then
+                #   texts = self.generate(prompt, images=image_groups, ...)
+                # store responses via self.response_to_grounded_textdocument
+                ...
+
+    Requires the ``[hf]`` extra (``pip install clams-python[hf]``).
+    """
+
+    #: HuggingFace model identifier (Hub repo name or local path).
+    #: Subclasses MUST set this.
+    MODEL_ID: Optional[str] = None
+    #: ``transformers`` model class (e.g.
+    #: :class:`~transformers.AutoModelForImageTextToText`,
+    #: :class:`~transformers.AutoModelForCausalLM`). Subclasses MUST
+    #: set this.
+    MODEL_CLS: Optional[Any] = None
+    #: ``transformers`` processor / tokenizer / feature-extractor
+    #: class. Defaults to :class:`~transformers.AutoProcessor` (set
+    #: by :func:`clams.backends.hf.load_hf_model` when ``None``).
+    PROCESSOR_CLS: Optional[Any] = None
+    #: Torch dtype for the model (e.g. ``torch.bfloat16``). When
+    #: ``None``, the model class's own default is used (typically
+    #: float32). Also used to cast ``pixel_values`` in
+    #: :py:meth:`generate`.
+    DTYPE: Optional[Any] = None
+    #: Tokenizer padding side. Set to ``'left'`` for decoder-only
+    #: batched generation; leave ``None`` otherwise.
+    PADDING_SIDE: Optional[str] = None
+    #: Extra kwargs forwarded to ``MODEL_CLS.from_pretrained()``.
+    MODEL_KWARGS: Optional[dict] = None
+    #: Extra kwargs forwarded to ``PROCESSOR_CLS.from_pretrained()``.
+    PROCESSOR_KWARGS: Optional[dict] = None
+
+    def __init__(self):
+        super().__init__()
+        cls_name = type(self).__name__
+        if self.MODEL_ID is None:
+            raise ValueError(
+                f"{cls_name} must set the ``MODEL_ID`` class attribute "
+                f"(a HuggingFace model identifier).")
+        if self.MODEL_CLS is None:
+            raise ValueError(
+                f"{cls_name} must set the ``MODEL_CLS`` class attribute "
+                f"(a ``transformers`` model class).")
+        # Lazy import: avoids pulling torch/transformers into the base
+        # clams-python install. Apps using this class must have the
+        # ``[hf]`` extra installed.
+        from clams.backends.hf import load_hf_model
+        self.logger.info(f"Loading HF model from {self.MODEL_ID}")
+        self.processor, self.model, self.device = load_hf_model(
+            self.MODEL_ID,
+            self.MODEL_CLS,
+            processor_cls=self.PROCESSOR_CLS,
+            dtype=self.DTYPE,
+            padding_side=self.PADDING_SIDE,
+            model_kwargs=self.MODEL_KWARGS,
+            processor_kwargs=self.PROCESSOR_KWARGS,
+        )
+        self.logger.info(f"HF model loaded on {self.device}")
+
+    def generate(
+            self,
+            prompt: List[str],
+            system_prompt: str = '',
+            images: Optional[List[List[Any]]] = None,
+            audios: Optional[List[List[Any]]] = None,
+            prompt_mode: str = 'turn-taking',
+            **generation_params,
+    ) -> List[str]:
+        """
+        Default implementation of the
+        :py:meth:`ClamsPromptableApp.generate` contract for
+        HuggingFace ``transformers`` models. Runs N prompts in one
+        forward pass; returns N decoded strings.
+
+        Each inner list of ``images`` / ``audios`` is the bundled
+        content for one prompt. When both ``images`` and ``audios``
+        are given they must have the same outer length (multimodal
+        pairs are stitched by index). When both are ``None``, runs as
+        a single text-only prompt.
+
+        The default body is the canonical HF chat-model pipeline:
+        :py:meth:`build_conversation` -> ``apply_chat_template`` ->
+        ``model.generate`` -> ``batch_decode``. Subclasses can
+        customize finer-grained pieces via
+        :py:meth:`build_conversation` (model-specific message shape)
+        and :py:meth:`build_gen_kwargs` (model-specific generation
+        kwargs) without touching this method.
+        """
+        if images is not None and audios is not None:
+            if len(images) != len(audios):
+                raise ValueError(
+                    f"images and audios must have the same outer length "
+                    f"when both are given; got "
+                    f"{len(images)} vs {len(audios)}.")
+        if images is not None:
+            n = len(images)
+        elif audios is not None:
+            n = len(audios)
+        else:
+            n = 1  # text-only single prompt
+        if n == 0:
+            return []
+        gen_kwargs = self.build_gen_kwargs(**generation_params)
+        try:
+            conversations = [
+                self.build_conversation(
+                    prompt, system_prompt=system_prompt,
+                    images=images[i] if images is not None else None,
+                    audios=audios[i] if audios is not None else None,
+                    prompt_mode=prompt_mode)
+                for i in range(n)
+            ]
+            inputs = self.processor.apply_chat_template(
+                conversations,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                padding=True,
+                return_tensors="pt",
+            )
+            inputs = inputs.to(self.device)
+            if (self.DTYPE is not None
+                    and 'pixel_values' in inputs
+                    and inputs['pixel_values'] is not None):
+                inputs['pixel_values'] = inputs['pixel_values'].to(
+                    dtype=self.DTYPE)
+            generated_ids = self.model.generate(**inputs, **gen_kwargs)
+            input_len = inputs.input_ids.shape[1]
+            new_tokens = generated_ids[:, input_len:]
+            return self.processor.batch_decode(
+                new_tokens, skip_special_tokens=True)
+        except Exception as e:
+            self.logger.error(
+                f"Error processing batch: {e}", exc_info=True)
+            return [''] * n
+
+    @staticmethod
+    def build_gen_kwargs(
+            max_new_tokens: int = 512,
+            temperature: float = 0.0,
+            top_p: float = 1.0,
+            top_k: int = 50,
+            **_unused,
+    ) -> dict:
+        """
+        Translate the SDK's promptable-parameter values into
+        HuggingFace ``model.generate()`` kwargs. Sampling kwargs
+        (``do_sample=True`` plus ``temperature`` / ``top_p`` /
+        ``top_k``) are emitted only when ``temperature > 0``.
+
+        The SDK's camelCase spellings (``maxNewTokens``, ``topP``,
+        ``topK``) are honored as aliases and take precedence, so the
+        full ``**parameters`` dict from ``_annotate`` can be passed
+        through unfiltered; other extra keyword args are ignored.
+        Subclasses MAY override to add model-specific generation
+        kwargs (``num_beams``, ``repetition_penalty``, etc.).
+        """
+        max_new_tokens = _unused.get('maxNewTokens', max_new_tokens)
+        top_p = _unused.get('topP', top_p)
+        top_k = _unused.get('topK', top_k)
+        gen_kwargs = {'max_new_tokens': max_new_tokens}
+        if temperature > 0:
+            gen_kwargs.update(
+                do_sample=True, temperature=temperature,
+                top_p=top_p, top_k=top_k)
+        return gen_kwargs
+
+
 class ParameterCaster(object):
 
     def __init__(self, param_spec: Dict[str, Tuple[str, bool]]):
diff --git a/clams/develop/templates/app/app.py.template b/clams/develop/templates/app/app.py.template
index d307ab4..35a5a95 100644
--- a/clams/develop/templates/app/app.py.template
+++ b/clams/develop/templates/app/app.py.template
@@ -24,13 +24,46 @@ from mmif import Mmif, View, Annotation, Document, AnnotationTypes, DocumentType
 from lapps.discriminators import Uri
 
 
-# If your app is a prompt-driven LLM/VLM/audio-LM app, change the base class
-# below from ``ClamsApp`` to ``ClamsPromptableApp`` (import via
-# ``from clams import ClamsPromptableApp``) and implement ``generate()``
-# instead of (or in addition to) ``_annotate()``. Don't forget to also
-# uncomment the matching helper call in ``metadata.py``. See
-# https://clams.ai/clams-python/app-baseclasses.html#promptable for the
+# =============================================================================
+# Pick a base class for your app:
+#
+#   ClamsApp ............ default; the rest of this scaffold inherits from it.
+#                         Implement ``_annotate()``. That's it.
+#                         Choose for any non-LLM/VLM app: classical OCR /
+#                         ASR engines, classifiers, rule-based tools, etc.
+#
+#   ClamsPromptableApp .. for prompt-driven LLM/VLM/ALM/LMM apps wrapping a
+#                         non-HF backend (remote APIs like OpenAI/Anthropic,
+#                         vLLM, custom inference servers).
+#                         Implement: ``_annotate()`` + ``generate()``.
+#                         Import:
+#                             from clams import ClamsPromptableApp
+#                         Also: uncomment ``inject_promptable_parameters``
+#                         block in ``metadata.py``.
+#
+#   ClamsHFPromptableApp  for prompt-driven apps wrapping a local HuggingFace
+#                         ``transformers`` model (the typical VLM/LLM case).
+#                         Implement: ``_annotate()`` + declare class
+#                         attributes:
+#                             MODEL_ID = "<hf-model-id>"
+#                             MODEL_CLS = <transformers.AutoModelFor...>
+#                             DTYPE = torch.bfloat16        # optional
+#                             PADDING_SIDE = 'left'          # optional
+#                         Import:
+#                             from clams.app import ClamsHFPromptableApp
+#                         Also: uncomment ``inject_promptable_parameters``
+#                         block in ``metadata.py``. Requires the ``[hf]``
+#                         extra: ``pip install clams-python[hf]``.
+#                         The base class provides ``__init__`` (loads the
+#                         model), ``generate()`` (HF batched inference),
+#                         ``build_conversation`` (chat-template message
+#                         list), and ``build_gen_kwargs`` (HF
+#                         ``model.generate()`` kwargs); override the latter
+#                         two only for model-specific quirks.
+#
+# See https://clams.ai/clams-python/app-baseclasses.html for the full
 # developer guide.
+# =============================================================================
 class $APP_CLASS_NAME(ClamsApp):
 
     def __init__(self):
diff --git a/documentation/app-baseclasses.rst b/documentation/app-baseclasses.rst
index afdfbbe..b5f8cdf 100644
--- a/documentation/app-baseclasses.rst
+++ b/documentation/app-baseclasses.rst
@@ -278,10 +278,11 @@ The ``generate()`` contract
 Subclasses MUST implement :meth:`~clams.app.ClamsPromptableApp.generate`.
 See the method's docstring for the full signature and parameter semantics.
 
-The return value is a flat ``List[str]``: one entry per input item (one per
-image when ``images`` is given, one per audio clip when ``audio`` is given,
-or a single-element list for text-only single-shot generation). Keep
-inference logic inside ``generate()`` distinct from MMIF I/O; the latter
+The return value is a flat ``List[str]`` with one entry per prompt in the
+batch: the outer length of ``images`` (and/or ``audios``) determines N;
+``generate()`` returns ``N`` strings. For text-only single-shot calls
+(both ``images`` and ``audios`` ``None``), the return is a singleton list.
+Keep inference logic inside ``generate()`` distinct from MMIF I/O; the latter
 belongs in ``_annotate()`` (which calls ``self.generate()``).
 
 This separation is intentional: future SDK releases may provide default
@@ -340,7 +341,7 @@ Helpers
     list (or a ``List[List[dict]]`` of progressively-extending prefixes
     for ``user-only`` mode). Handles string and list prompt forms, the
     two ``promptMode`` semantics, the optional ``systemPrompt``, and
-    inlines ``images`` / ``audio`` into the (final) user turn. Accepts
+    inlines ``images`` / ``audios`` into the (final) user turn. Accepts
     a pre-built ``List[dict]`` and returns it unchanged. Subclasses
     may override to access model-specific state (e.g.
     ``self.processor``) when formatting messages.
@@ -420,6 +421,12 @@ extractor + classifier head) leave both at the defaults and pass any
 class-specific kwargs through ``model_kwargs`` /
 ``processor_kwargs``.
 
+For promptable apps specifically, the
+:class:`~clams.app.ClamsHFPromptableApp` base class (see
+:ref:`hf-promptable`) wraps this helper plus the standard inference
+loop, so most HF-backed VLM/LLM apps don't need to call
+:func:`load_hf_model` directly.
+
 Installation
 ~~~~~~~~~~~~
 
@@ -435,3 +442,129 @@ plain ``clams-python`` install can still import :mod:`clams.app` and
 :class:`~clams.app.ClamsPromptableApp` without those dependencies; the
 ``ImportError`` only fires when an app actually calls
 :func:`clams.backends.hf.load_hf_model`.
+
+.. _hf-promptable:
+
+HuggingFace Promptable Apps
+---------------------------
+
+For the very common case of "promptable CLAMS app + local HuggingFace
+``transformers`` model," the SDK provides
+:class:`~clams.app.ClamsHFPromptableApp`, a specialized subclass of
+:class:`~clams.app.ClamsPromptableApp` that absorbs all HF-specific
+inference boilerplate. Concrete apps inheriting from it declare the
+model via a few class attributes and typically only need to implement
+``_annotate()`` for their MMIF I/O.
+
+When to use
+^^^^^^^^^^^
+
+Choose :class:`~clams.app.ClamsHFPromptableApp` over plain
+:class:`~clams.app.ClamsPromptableApp` when your app:
+
+- wraps a local HuggingFace ``transformers`` model loadable via
+  ``from_pretrained()``, AND
+- runs the standard chat-template -> ``model.generate`` ->
+  ``batch_decode`` inference pipeline (every modern instruct-tuned
+  VLM/LLM in HF), AND
+- doesn't need bespoke pixel-value preprocessing or vision-token
+  stitching at inference time.
+
+If your app uses a remote API instead (OpenAI, Anthropic, etc.), or a
+non-HF local backend, inherit from
+:class:`~clams.app.ClamsPromptableApp` directly and implement
+:meth:`~clams.app.ClamsPromptableApp.generate` yourself.
+
+Class-attribute hooks
+^^^^^^^^^^^^^^^^^^^^^
+
+Concrete subclasses specify the model declaratively via class
+attributes; the base ``__init__`` reads them, calls
+:func:`load_hf_model`, and stores ``self.processor``, ``self.model``,
+``self.device``:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 22 60 18
+
+   * - Attribute
+     - Meaning
+     - Required
+   * - ``MODEL_ID``
+     - HuggingFace model identifier (Hub repo name or local path).
+     - yes
+   * - ``MODEL_CLS``
+     - ``transformers`` model class (e.g.
+       :class:`~transformers.AutoModelForImageTextToText`,
+       :class:`~transformers.AutoModelForCausalLM`).
+     - yes
+   * - ``PROCESSOR_CLS``
+     - Processor / tokenizer / feature-extractor class. Defaults to
+       :class:`~transformers.AutoProcessor`.
+     - no
+   * - ``DTYPE``
+     - Torch dtype for the model and for ``pixel_values`` casting in
+       :py:meth:`~clams.app.ClamsHFPromptableApp.generate`. E.g.
+       ``torch.bfloat16`` for low-precision LLM inference.
+     - no
+   * - ``PADDING_SIDE``
+     - Tokenizer padding side. ``'left'`` for decoder-only batched
+       generation; leave unset otherwise.
+     - no
+   * - ``MODEL_KWARGS`` / ``PROCESSOR_KWARGS``
+     - Extra kwargs forwarded to the respective
+       ``from_pretrained()`` calls (e.g.
+       ``trust_remote_code=True``).
+     - no
+
+What the base class provides
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- A default :py:meth:`~clams.app.ClamsHFPromptableApp.__init__` that
+  loads the model from the class attributes via
+  :func:`load_hf_model`.
+- A concrete :py:meth:`~clams.app.ClamsHFPromptableApp.generate` that
+  satisfies the :class:`~clams.app.ClamsPromptableApp` abstract
+  contract. Takes ``images`` / ``audios`` as ``List[List[Any]]``
+  (N groups, one per prompt) and runs all N prompts in one HF
+  forward pass; returns one decoded string per group. Apps call
+  this from ``_annotate`` to run their inference; per-image
+  broadcast is a singleton-wrap (``images=[[img] for img in
+  images]``), per-TF composite is one group of N images per TF.
+- A default
+  :py:meth:`~clams.app.ClamsHFPromptableApp.build_gen_kwargs` that
+  maps SDK promptable parameters (``maxNewTokens``, ``temperature``,
+  ``topP``, ``topK``) into HF ``model.generate()`` kwargs.
+  Subclasses may override to add model-specific kwargs
+  (``num_beams``, ``repetition_penalty``, custom stopping criteria,
+  etc.).
+
+Minimal subclass example
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: python
+
+    from transformers import AutoModelForImageTextToText
+    import torch
+
+    from clams.app import ClamsHFPromptableApp
+
+
+    class MyVLMCaptioner(ClamsHFPromptableApp):
+        MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+        MODEL_CLS = AutoModelForImageTextToText
+        DTYPE = torch.bfloat16
+        PADDING_SIDE = 'left'
+
+        def _appmetadata(self):
+            pass  # defined in metadata.py
+
+        def _annotate(self, mmif, **parameters):
+            ...  # collect tasks from MMIF, build image groups, call
+                 # self.generate(prompt, images=image_groups, ...), then
+                 # store responses via self.response_to_grounded_textdocument
+
+The ``metadata.py`` for a :class:`~clams.app.ClamsHFPromptableApp`
+subclass is identical to that of a plain
+:class:`~clams.app.ClamsPromptableApp` app -- the helper-call
+requirement and the parameter table are unchanged.
diff --git a/pyproject.toml b/pyproject.toml
index 44fd9de..048410d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,7 @@ docs = ["sphinx>=7.0,<8.0", "furo", "m2r2", "sphinx-jsonschema"]
 test = ["pytype", "pytest", "pytest-cov", "pillow"]
 # Required for apps using the HuggingFace transformers backend
 # (clams.backends.hf). Heavy deps; opt-in only.
-hf = ["torch", "transformers", "pillow"]
+hf = ["torch", "transformers", "pillow", "tqdm"]
 
 [tool.setuptools.packages.find]
 where = ["."]
diff --git a/tests/test_promptable.py b/tests/test_promptable.py
index 4614ca3..5426c14 100644
--- a/tests/test_promptable.py
+++ b/tests/test_promptable.py
@@ -386,5 +386,79 @@ def test_single_element_prompt_still_list(self):
         self.assertEqual(refined['prompt'], ['only'])
 
 
+# ---------------------------------------------------------------------------
+# ClamsHFPromptableApp class-attribute validation
+# ---------------------------------------------------------------------------
+
+class TestHFPromptableAppClassAttrs(unittest.TestCase):
+    """
+    Exercises the class-attribute validation in
+    :class:`ClamsHFPromptableApp.__init__`. The actual model loading
+    is patched out so these tests don't require torch/transformers.
+    End-to-end inference tests live separately.
+    """
+
+    def _make_subclass(self, *, model_id=None, model_cls=None, **extra_attrs):
+        attrs = {
+            '_load_appmetadata': lambda self: make_metadata(call_helper=True),
+            '_appmetadata': lambda self: None,
+            '_annotate': lambda self, mmif, **kw: mmif,
+            'MODEL_ID': model_id,
+            'MODEL_CLS': model_cls,
+        }
+        attrs.update(extra_attrs)
+        from clams.app import ClamsHFPromptableApp
+        return type('TestHFApp', (ClamsHFPromptableApp,), attrs)
+
+    def test_missing_model_id_raises(self):
+        cls = self._make_subclass(model_id=None, model_cls=object)
+        with self.assertRaises(ValueError) as ctx:
+            cls()
+        self.assertIn('MODEL_ID', str(ctx.exception))
+
+    def test_missing_model_cls_raises(self):
+        cls = self._make_subclass(model_id='fake-id', model_cls=None)
+        with self.assertRaises(ValueError) as ctx:
+            cls()
+        self.assertIn('MODEL_CLS', str(ctx.exception))
+
+    def test_loads_via_load_hf_model_with_class_attrs(self):
+        """
+        Patches ``clams.backends.hf.load_hf_model`` and verifies the
+        base ``__init__`` forwards the declared class attributes to it.
+        """
+        import clams.backends.hf as hf_module
+        original = hf_module.load_hf_model
+        captured = {}
+
+        def fake_load(model_id, model_cls, **kwargs):
+            captured['model_id'] = model_id
+            captured['model_cls'] = model_cls
+            captured.update(kwargs)
+            return ('FAKE_PROCESSOR', 'FAKE_MODEL', 'cpu')
+
+        try:
+            hf_module.load_hf_model = fake_load
+            cls = self._make_subclass(
+                model_id='org/fake-model',
+                model_cls=object,
+                DTYPE='FAKE_DTYPE',
+                PADDING_SIDE='left',
+                MODEL_KWARGS={'trust_remote_code': True},
+            )
+            app = cls()
+            self.assertEqual(app.processor, 'FAKE_PROCESSOR')
+            self.assertEqual(app.model, 'FAKE_MODEL')
+            self.assertEqual(app.device, 'cpu')
+            self.assertEqual(captured['model_id'], 'org/fake-model')
+            self.assertIs(captured['model_cls'], object)
+            self.assertEqual(captured['dtype'], 'FAKE_DTYPE')
+            self.assertEqual(captured['padding_side'], 'left')
+            self.assertEqual(
+                captured['model_kwargs'], {'trust_remote_code': True})
+        finally:
+            hf_module.load_hf_model = original
+
+
 if __name__ == '__main__':
     unittest.main()

From 7bc91d97ff0af2a2253ca24b00afa8b554ad5849 Mon Sep 17 00:00:00 2001
From: Keigh Rim <keigh.rim@gmail.com>
Date: Fri, 29 May 2026 16:06:25 -0400
Subject: [PATCH 07/10] added timeframe filtering util recipe as `clams develop
 --recipe utl-tf`

---
 clams/develop/__init__.py                     |  18 ++-
 .../templates/utl-tf/__init__.py.template     |   0
 .../templates/utl-tf/timeframe.py.template    | 144 ++++++++++++++++++
 3 files changed, 160 insertions(+), 2 deletions(-)
 create mode 100644 clams/develop/templates/utl-tf/__init__.py.template
 create mode 100644 clams/develop/templates/utl-tf/timeframe.py.template

diff --git a/clams/develop/__init__.py b/clams/develop/__init__.py
index 4925780..5cbd84f 100644
--- a/clams/develop/__init__.py
+++ b/clams/develop/__init__.py
@@ -18,7 +18,13 @@
         'description': 'GtiHub Actions workflow files specific to `clamsproject` GitHub organization',
         'sourcedir': 'gha',
         'targetdir': '.github',
-    }
+    },
+    'utl-tf': {
+        'description': 'Local helper module for iterating TimeFrames and collecting per-TF frame tasks '
+                       '(baked into ``utils/timeframe.py``; backend-agnostic, safe to edit/delete)',
+        'sourcedir': 'utl-tf',
+        'targetdir': 'utils',
+    },
 }
 
 
@@ -65,12 +71,20 @@ def bake(self, update_level=0):
             if recipe == 'gha':
                 # There's nothing for devs to tweak GHA template, so first generation and updating are the same.
                 self.bake_gha(src_dir, dst_dir)
+            if recipe.startswith('utl-'):
+                # Utility recipes bake static helper modules; once baked the
+                # code is local to the app and devs are free to edit. No
+                # templating variables are needed, so pass an empty dict:
+                # ``safe_substitute`` then only collapses ``$$`` into ``$``.
+                if dst_dir.exists() and update_level == 0:
+                    raise FileExistsError(f"  {dst_dir} already exists. Did you mean `--update`? ")
+                self.bake_app(src_dir, dst_dir, {})
             
     def bake_app(self, src_dir, dst_dir, templating_vars):
         for g in src_dir.glob("**/*.template"):
             r = g.relative_to(src_dir).parent
             f = g.with_suffix('').name
-            (dst_dir / r).mkdir(exist_ok=True)
+            (dst_dir / r).mkdir(parents=True, exist_ok=True)
             
             with open(g, 'r') as in_f, open(dst_dir/r/f, 'w') as out_f:
                 tmpl_to_compile = Template(in_f.read())
diff --git a/clams/develop/templates/utl-tf/__init__.py.template b/clams/develop/templates/utl-tf/__init__.py.template
new file mode 100644
index 0000000..e69de29
diff --git a/clams/develop/templates/utl-tf/timeframe.py.template b/clams/develop/templates/utl-tf/timeframe.py.template
new file mode 100644
index 0000000..8903144
--- /dev/null
+++ b/clams/develop/templates/utl-tf/timeframe.py.template
@@ -0,0 +1,144 @@
+"""
+TimeFrame iteration / frame-sampling helpers, local to this app.
+
+Generated by ``clams develop -r utl-tf``. The code in this file is part
+of your app, not the SDK; edit it freely, refactor as needed, or delete
+the whole file if your app does not iterate TimeFrame annotations.
+
+The functions here factor out the canonical pattern that any CLAMS app
+processing video by TimeFrames tends to write:
+
+  1. iterate TimeFrame annotations across input views, optionally
+     filtered by label
+  2. sample frames per TF using the universal ``tfSamplingMode``
+     parameter (representative TimePoints, the middle representative,
+     or every target / native-FPS frame)
+  3. when ``vdh`` returns a fallback timestamp (milliseconds, no
+     existing TP behind it), mint a fresh ``TimePoint`` annotation in
+     the app's new view so downstream code has a stable anchor id
+  4. assemble per-TF task tuples that downstream batching /
+     inference / annotation code can consume uniformly
+
+The helpers are backend-agnostic: tasks can feed a HuggingFace VLM, a
+remote LLM API, a classical CV pipeline, or any other per-frame
+processor. They have no dependency on ``clams.app.ClamsPromptableApp``
+or any other promptable / inference machinery.
+"""
+from typing import Any, Iterator, List, Optional, Tuple, Union
+
+from mmif import Annotation, Document, Mmif, View, AnnotationTypes
+from mmif.utils import video_document_helper as vdh
+
+
+def iter_timeframes(
+        mmif: Mmif, tflabels_of_interest: List[str],
+) -> Iterator[Annotation]:
+    """
+    Yield every TimeFrame annotation in ``mmif``, filtered by
+    ``tflabels_of_interest`` when non-empty.
+
+    :param mmif: the input MMIF object.
+    :param tflabels_of_interest: when non-empty, only TFs whose
+        ``label`` property matches one of these are yielded. An
+        empty list (the default in most apps) yields every TF
+        regardless of label.
+    """
+    for view in mmif.get_all_views_contain(AnnotationTypes.TimeFrame):
+        for tf in view.get_annotations(AnnotationTypes.TimeFrame):
+            if (tflabels_of_interest
+                    and tf.get_property('label') not in tflabels_of_interest):
+                continue
+            yield tf
+
+
+def to_timepoints(
+        parent_view: View,
+        video_doc: Document,
+        sources: List[Union[str, int]],
+) -> List[str]:
+    """
+    Normalize a list of frame ``sources`` (as returned by
+    :func:`vdh.extract_images_by_mode_with_sources`) into a parallel
+    list of TimePoint ``id``\\ s.
+
+    Each ``source`` is either:
+
+    * ``str`` -- the id of an existing TimePoint annotation
+      (representative / target). Passed through unchanged.
+    * ``int`` -- a millisecond timestamp returned by ``vdh`` for the
+      interval-fallback case (``tfSamplingMode=single`` with no
+      representatives, or ``tfSamplingMode=all`` with no targets).
+      A fresh ``TimePoint`` annotation is minted in ``parent_view``
+      at this timestamp; the new annotation's id is returned.
+
+    The ``TimePoint`` type is registered with
+    ``parent_view.new_contain()`` lazily on the first mint, so apps
+    that never hit the fallback path do not get an empty
+    ``TimePoint`` entry in their view metadata.
+
+    :param parent_view: the view this app is writing into; receives
+        any freshly-minted TimePoints.
+    :param video_doc: the source VideoDocument; recorded as
+        ``document`` on each minted TimePoint.
+    :param sources: per-frame source identifiers from ``vdh``.
+    :return: a list of TimePoint ids, parallel to ``sources``.
+    """
+    tp_contain_registered = False
+    out: List[str] = []
+    for src in sources:
+        if isinstance(src, str):
+            out.append(src)
+        else:
+            if not tp_contain_registered:
+                parent_view.new_contain(AnnotationTypes.TimePoint)
+                tp_contain_registered = True
+            tp = parent_view.new_annotation(
+                AnnotationTypes.TimePoint,
+                document=video_doc.id,
+                timePoint=int(src),
+                timeUnit='milliseconds',
+            )
+            out.append(tp.id)
+    return out
+
+
+def collect_timeframes_of_interest(
+        mmif: Mmif,
+        parent_view: View,
+        video_doc: Document,
+        tflabels_of_interest: List[str],
+) -> List[Tuple[List[Any], List[str], str, Optional[str]]]:
+    """
+    Convenience composition of :func:`iter_timeframes`,
+    :func:`vdh.extract_images_by_mode_with_sources`, and
+    :func:`to_timepoints`. Returns one
+    ``(images, tp_ids, tf_id, tf_label)`` task per matching TimeFrame
+    that produced at least one sampled frame.
+
+    Each task's ``images`` and ``tp_ids`` are parallel lists -- one
+    entry per frame sampled from that TF (length 1 for
+    ``tfSamplingMode=single``, N for ``representatives`` / ``all``).
+    Each entry of ``tp_ids`` is either the id of an existing
+    TimePoint or the id of a freshly-minted one (see
+    :func:`to_timepoints`). ``tf_label`` is the source TimeFrame's
+    ``label`` property value, or ``None`` if unset.
+
+    :param mmif: the input MMIF.
+    :param parent_view: the view this app is writing into.
+    :param video_doc: the source VideoDocument that frames are
+        extracted from.
+    :param tflabels_of_interest: optional label filter; empty list =
+        no filter.
+    :return: per-TF task tuples, ready to feed a batched inference
+        loop or any other per-frame processor.
+    """
+    tasks: List[Tuple[List[Any], List[str], str, Optional[str]]] = []
+    for tf in iter_timeframes(mmif, tflabels_of_interest):
+        images, sources = vdh.extract_images_by_mode_with_sources(
+            mmif, tf, as_PIL=True)
+        if not images:
+            continue
+        tp_ids = to_timepoints(parent_view, video_doc, sources)
+        tf_label = tf.get_property('label')
+        tasks.append((list(images), tp_ids, tf.id, tf_label))
+    return tasks

From 5022a0e46a4ff9ed7fbad670c2d8c6aad9f73333 Mon Sep 17 00:00:00 2001
From: Keigh Rim <keigh.rim@gmail.com>
Date: Fri, 29 May 2026 15:15:14 -0400
Subject: [PATCH 08/10] added a better metadata-parameter integration for
 HF-based multi-model apps

---
 clams/app/__init__.py                         | 206 +++++++++++--
 clams/backends/hf.py                          |  25 +-
 clams/develop/templates/app/app.py.template   |  33 +-
 .../templates/app/metadata.py.template        |  24 +-
 documentation/app-baseclasses.rst             | 283 ++++++++----------
 documentation/introduction.rst                |   2 +-
 documentation/runtime-params.rst              |  20 +-
 documentation/tutorial.md                     |  10 +-
 tests/test_promptable.py                      | 185 ++++++++++--
 9 files changed, 524 insertions(+), 264 deletions(-)

diff --git a/clams/app/__init__.py b/clams/app/__init__.py
index b75233e..88f612b 100644
--- a/clams/app/__init__.py
+++ b/clams/app/__init__.py
@@ -75,7 +75,7 @@ class ClamsApp(ABC):
         # how vdh.extract_frames_by_mode() selects frames from TimeFrames.
         # The value is intercepted in annotate() and pushed into a
         # contextvars.ContextVar so that any vdh call inside _annotate()
-        # picks it up automatically — app developers never need to handle
+        # picks it up automatically; app developers never need to handle
         # this parameter themselves.
         {
             'name': 'tfSamplingMode', 'type': 'string',
@@ -643,15 +643,14 @@ def open_document_location(document: Union[str, Document], opener: Any = open, *
 # autodoc-based auto documentation export (e.g., ``automethod`` for
 # methods and a small Sphinx extension to render
 # ``promptable_parameters`` into the parameter table), instead of the
-# current hand-authored ``documentation/app-baseclasses.rst``. 
+# current hand-authored ``documentation/app-baseclasses.rst``.
 class ClamsPromptableApp(ClamsApp):
     """
     Base class for CLAMS apps that wrap a promptable model (an LLM or
     other multimodal model, local or remote). Standardizes the runtime
-    parameter surface
-    (prompt, generation hyperparameters, batch size) and provides
-    helpers for building chat conversations and persisting model
-    responses into MMIF.
+    parameter surface (prompt, generation hyperparameters, parallelism
+    control) and provides helpers for building chat conversations and
+    persisting model responses into MMIF.
 
     The standardized parameters are listed in
     :py:attr:`promptable_parameters` and added to an app's metadata via
@@ -665,17 +664,17 @@ class ClamsPromptableApp(ClamsApp):
     Inference is performed by :py:meth:`generate`, which subclasses MUST
     implement. The base class provides:
 
-    * :py:meth:`inject_promptable_parameters` — add the SDK-managed
+    * :py:meth:`inject_promptable_parameters` : adds the SDK-managed
       parameter set to ``AppMetadata``
-    * :py:meth:`build_conversation` — assemble a chat-template-compatible
-      message list (stub in this release)
-    * :py:meth:`response_to_grounded_textdocument` — persist a
+    * :py:meth:`build_conversation` : assembles a chat-template-compatible
+      message list from a prompt plus optional images/audios
+    * :py:meth:`response_to_grounded_textdocument` : persists a
       generated response into a view as ``TextDocument`` +
       ``Alignment`` (+ optional ``origins`` / ``origination``)
     """
 
     #: SDK-managed runtime parameters injected into every promptable app.
-    #: These names are reserved — apps cannot redeclare them with
+    #: These names are reserved; apps cannot redeclare them with
     #: customized specs.
     promptable_parameters = [
         {
@@ -774,7 +773,7 @@ def __init__(self):
         # ``inject_promptable_parameters()`` from inside
         # ``appmetadata()``. The parent ``__init__`` then iterates
         # ``self.metadata.parameters`` to populate
-        # ``annotate_param_spec`` and build the caster — so the
+        # ``annotate_param_spec`` and build the caster, so the
         # promptable parameters are already covered by the time we land
         # here. We only validate that the helper was actually called.
         super().__init__()
@@ -857,7 +856,7 @@ def build_conversation(
 
         :param prompt: a plain string, a ``List[str]`` of prompt turns,
             or a pre-built ``List[dict]`` of role/content message
-            objects (returned as-is — pass-through for advanced
+            objects (returned as-is; pass-through for advanced
             callers that constructed the conversation themselves).
         :param system_prompt: if non-empty, prepended as a
             system-role message.
@@ -1081,29 +1080,37 @@ class ClamsHFPromptableApp(ClamsPromptableApp):
     prompts in one HF forward pass via the standard
     chat-template -> ``model.generate`` -> ``batch_decode`` pipeline.
 
-    Concrete subclasses declare the model via class attributes
-    (:py:attr:`MODEL_ID`, :py:attr:`MODEL_CLS`, etc.) and typically
-    only need to implement :py:meth:`_annotate` -- the per-app MMIF
-    I/O. Example::
+    Concrete subclasses declare the model class via :py:attr:`MODEL_CLS`
+    plus a handful of optional dtype/padding hints, and the family of
+    pinned model revisions via ``analyzer_versions`` in
+    ``metadata.py``. The SDK auto-derives a ``model`` runtime
+    parameter (choices = keys of ``analyzer_versions``), and the dev's
+    ``_annotate`` calls :py:meth:`load_model` to (lazily) load the
+    requested family member. Singleton families (one entry in
+    ``analyzer_versions``) eagerly pre-load in ``__init__`` so
+    single-model apps preserve warm-start semantics. Example::
 
         class MyVLMCaptioner(ClamsHFPromptableApp):
-            MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
             MODEL_CLS = AutoModelForImageTextToText
             DTYPE = torch.bfloat16
             PADDING_SIDE = 'left'
 
+            # In metadata.py:
+            #     analyzer_versions={
+            #         "HuggingFaceTB/SmolVLM2-2.2B-Instruct": "482adb5",
+            #     }
+            # plus a call to
+            # ClamsHFPromptableApp.inject_promptable_parameters(metadata).
+
             def _annotate(self, mmif, **parameters):
-                # collect tasks from MMIF, build image groups, then
-                #   texts = self.generate(prompt, images=image_groups, ...)
-                # store responses via self.response_to_grounded_textdocument
+                self.load_model(parameters['model'])
+                # ... self.generate(prompt, images=image_groups, ...)
+                # ... self.response_to_grounded_textdocument(...)
                 ...
 
     Requires the ``[hf]`` extra (``pip install clams-python[hf]``).
     """
 
-    #: HuggingFace model identifier (Hub repo name or local path).
-    #: Subclasses MUST set this.
-    MODEL_ID: Optional[str] = None
     #: ``transformers`` model class (e.g.
     #: :class:`~transformers.AutoModelForImageTextToText`,
     #: :class:`~transformers.AutoModelForCausalLM`). Subclasses MUST
@@ -1126,32 +1133,167 @@ def _annotate(self, mmif, **parameters):
     #: Extra kwargs forwarded to ``PROCESSOR_CLS.from_pretrained()``.
     PROCESSOR_KWARGS: Optional[dict] = None
 
+    @staticmethod
+    def inject_promptable_parameters(metadata: AppMetadata) -> None:
+        """
+        Add the SDK-managed promptable parameters AND a ``model``
+        parameter derived from ``metadata.analyzer_versions`` to the
+        app metadata. Overrides
+        :py:meth:`ClamsPromptableApp.inject_promptable_parameters` for
+        HF apps; call this at the end of your app's ``appmetadata()``
+        function in ``metadata.py`` if your app subclasses
+        :py:class:`ClamsHFPromptableApp`.
+
+        :param metadata: the :class:`AppMetadata` instance being
+            built. ``metadata.analyzer_versions`` MUST already be set
+            to a non-empty ``Dict[str, str]`` (model id -> commit
+            hash); this helper reads it to derive the ``model``
+            parameter's choices.
+        :raises ValueError: if ``metadata.analyzer_versions`` is
+            missing or empty.
+        """
+        ClamsPromptableApp.inject_promptable_parameters(metadata)
+        analyzer_versions = metadata.analyzer_versions or {}
+        if not analyzer_versions:
+            raise ValueError(
+                "ClamsHFPromptableApp.inject_promptable_parameters "
+                "requires ``metadata.analyzer_versions`` to be a "
+                "non-empty dict (HF model id -> commit hash). Set "
+                "it on the ``AppMetadata`` constructor call before "
+                "invoking this helper.")
+        choices = list(analyzer_versions.keys())
+        default = choices[0] if len(choices) == 1 else None
+        metadata.add_parameter(
+            name='model',
+            type='string',
+            choices=choices,
+            default=default,
+            multivalued=False,
+            description=(
+                "HuggingFace model identifier to use for this "
+                "request. Must be one of the model ids declared in "
+                "this app's ``analyzer_versions``; the SDK pins the "
+                "corresponding commit hash at load time. When the "
+                "app ships a single model (the typical case), this "
+                "parameter defaults to that one model and can be "
+                "omitted. Pass the full HF model id (e.g. "
+                "``org/repo-name``); URL-encoding the ``/`` is "
+                "optional."
+            ),
+        )
+
     def __init__(self):
         super().__init__()
         cls_name = type(self).__name__
-        if self.MODEL_ID is None:
-            raise ValueError(
-                f"{cls_name} must set the ``MODEL_ID`` class attribute "
-                f"(a HuggingFace model identifier).")
         if self.MODEL_CLS is None:
             raise ValueError(
                 f"{cls_name} must set the ``MODEL_CLS`` class attribute "
                 f"(a ``transformers`` model class).")
+        analyzer_versions = self.metadata.analyzer_versions
+        if not analyzer_versions:
+            raise ValueError(
+                f"{cls_name} must declare ``analyzer_versions`` in "
+                f"``metadata.py`` as a non-empty Dict[str, str] "
+                f"mapping HuggingFace model ids to pinned commit "
+                f"hashes (7-char abbreviation is sufficient). This is "
+                f"required for reproducibility: an unpinned download "
+                f"silently floats on whatever ``main`` points at and "
+                f"cannot be reproduced. Singleton families (one "
+                f"entry) are fine; multi-model families list every "
+                f"member.")
+        if 'model' not in {p.name for p in self.metadata.parameters}:
+            raise ValueError(
+                f"{cls_name} must call "
+                f"``ClamsHFPromptableApp.inject_promptable_parameters"
+                f"(metadata)`` (the HF override that also adds the "
+                f"``model`` parameter) inside ``appmetadata()`` in "
+                f"``metadata.py``; calling "
+                f"``ClamsPromptableApp.inject_promptable_parameters`` "
+                f"directly skips the ``model`` parameter and trips "
+                f"this check.")
+        #: Per-(model_id, revision) cache of loaded
+        #: ``(processor, model, device)`` triples. Populated by
+        #: :py:meth:`load_model`; survives for the lifetime of this
+        #: app instance.
+        self._model_cache: Dict[Tuple[str, str], Tuple[Any, Any, str]] = {}
+        #: References to the currently-active loaded model. Set by
+        #: :py:meth:`load_model`; ``generate()`` and friends read
+        #: from here. ``None`` until the first ``load_model`` call
+        #: (or until ``__init__`` eager-loads a singleton family).
+        self.processor: Any = None
+        self.model: Any = None
+        self.device: Optional[str] = None
+        # Singleton families pre-load in ``__init__`` so single-model
+        # apps preserve warm-start UX (no first-request latency cost).
+        # Multi-member families defer to lazy loading on the first
+        # ``load_model`` call.
+        if len(analyzer_versions) == 1:
+            only_model_id = next(iter(analyzer_versions.keys()))
+            self.load_model(only_model_id)
+
+    def _refine_params(self, **runtime_params):
+        """
+        Expand ``model`` from the raw HF id (``org/name``) to
+        ``org/name@<revision>`` so the resolved revision lands in
+        ``view.metadata.appConfiguration['model']``.
+        """
+        refined = super()._refine_params(**runtime_params)
+        model_id = refined.get('model')
+        if isinstance(model_id, str) and '@' not in model_id:
+            revision = (self.metadata.analyzer_versions or {}).get(model_id)
+            if revision is not None:
+                refined['model'] = f"{model_id}@{revision}"
+        return refined
+
+    def load_model(
+            self, model_id_or_with_rev: str,
+    ) -> Tuple[Any, Any, str]:
+        """
+        Load (or return cached) ``(processor, model, device)`` for
+        the given model id. Accepts both refined (``org/name@rev``)
+        and raw (``org/name``) forms; for raw form, the revision is
+        looked up from ``self.metadata.analyzer_versions``. Caches
+        results per ``(model_id, revision)`` and updates
+        :py:attr:`self.processor`, :py:attr:`self.model`,
+        :py:attr:`self.device` to the loaded triple so subsequent
+        :py:meth:`generate` calls operate on it.
+
+        :param model_id_or_with_rev: HF model id, optionally with
+            ``@<revision>`` suffix.
+        :return: ``(processor, model, device)`` tuple for the loaded
+            model. Same references are also stored on ``self``.
+        :raises KeyError: if a raw model id is passed and is not in
+            ``analyzer_versions``.
+        """
+        if '@' in model_id_or_with_rev:
+            model_id, _, revision = model_id_or_with_rev.rpartition('@')
+        else:
+            model_id = model_id_or_with_rev
+            revision = self.metadata.analyzer_versions[model_id]
+        cache_key = (model_id, revision)
+        cached = self._model_cache.get(cache_key)
+        if cached is not None:
+            self.processor, self.model, self.device = cached
+            return cached
         # Lazy import: avoids pulling torch/transformers into the base
         # clams-python install. Apps using this class must have the
         # ``[hf]`` extra installed.
         from clams.backends.hf import load_hf_model
-        self.logger.info(f"Loading HF model from {self.MODEL_ID}")
-        self.processor, self.model, self.device = load_hf_model(
-            self.MODEL_ID,
+        self.logger.info(f"Loading HF model from {model_id} @ {revision}")
+        triple = load_hf_model(
+            model_id,
             self.MODEL_CLS,
             processor_cls=self.PROCESSOR_CLS,
             dtype=self.DTYPE,
             padding_side=self.PADDING_SIDE,
+            revision=revision,
             model_kwargs=self.MODEL_KWARGS,
             processor_kwargs=self.PROCESSOR_KWARGS,
         )
-        self.logger.info(f"HF model loaded on {self.device}")
+        self.logger.info(f"HF model loaded on {triple[2]}")
+        self._model_cache[cache_key] = triple
+        self.processor, self.model, self.device = triple
+        return triple
 
     def generate(
             self,
diff --git a/clams/backends/hf.py b/clams/backends/hf.py
index ca0420e..a92df58 100644
--- a/clams/backends/hf.py
+++ b/clams/backends/hf.py
@@ -4,7 +4,7 @@
 Provides :func:`load_hf_model`, a general loader that wraps the device,
 processor, dtype, and inference-mode boilerplate every HF-backed CLAMS
 app does identically. Usable for any model class that supports
-``from_pretrained()`` — instruction-tuned LLMs/VLMs, encoder-only
+``from_pretrained()``: instruction-tuned LLMs/VLMs, encoder-only
 classifiers, vision/audio feature extractors, etc.
 
 ``torch`` and ``transformers`` are optional dependencies. Install them
@@ -27,6 +27,7 @@ def load_hf_model(
         dtype=None,
         device: Optional[str] = None,
         padding_side: Optional[str] = None,
+        revision: Optional[str] = None,
         model_kwargs: Optional[dict] = None,
         processor_kwargs: Optional[dict] = None,
 ) -> Tuple[Any, Any, str]:
@@ -60,6 +61,21 @@ class with ``from_pretrained()``. Defaults to
         token is set -- uses the EOS token as the pad token. Leave
         ``None`` for encoder / non-batched cases (the tokenizer's own
         default is preserved).
+    :param revision: optional Git revision (commit hash, branch name,
+        or tag) on the Hub repository to pin the download to. When
+        set, forwarded as ``revision=...`` to ``from_pretrained`` of
+        both ``model_cls`` and ``processor_cls`` so the model and
+        processor load from the same commit; an explicit ``revision``
+        key in ``model_kwargs`` / ``processor_kwargs`` takes
+        precedence. Strongly recommended for production: pinning a
+        commit hash makes the analyzer artifact reproducible and
+        immune to silent upstream updates. Apps calling this helper
+        directly should record the same hash on ``analyzer_version``
+        (or ``analyzer_versions``) in ``metadata.py`` so the output
+        MMIF identifies the exact artifact. Apps inheriting from
+        :class:`~clams.app.ClamsHFPromptableApp` need not call this
+        helper directly: the base class reads ``analyzer_versions``
+        from the app metadata and forwards the resolved revision.
     :param model_kwargs: extra kwargs forwarded to
         ``model_cls.from_pretrained()`` (e.g.,
         ``{'use_safetensors': True, 'add_pooling_layer': False}``).
@@ -98,8 +114,11 @@ class with ``from_pretrained()``. Defaults to
         # default to AutoProcessor
         processor_cls = transformers.AutoProcessor
     if processor_cls is not None:
+        processor_load_kwargs = dict(processor_kwargs or {})
+        if revision is not None:
+            processor_load_kwargs.setdefault('revision', revision)
         processor = processor_cls.from_pretrained(
-            model_id, **(processor_kwargs or {}))
+            model_id, **processor_load_kwargs)
         if padding_side is not None:
             tokenizer = getattr(processor, 'tokenizer', processor)
             tokenizer.padding_side = padding_side
@@ -114,6 +133,8 @@ class with ``from_pretrained()``. Defaults to
     model_load_kwargs = dict(model_kwargs or {})
     if dtype is not None:
         model_load_kwargs['torch_dtype'] = dtype
+    if revision is not None:
+        model_load_kwargs.setdefault('revision', revision)
     model = model_cls.from_pretrained(model_id, **model_load_kwargs)
     model = model.to(resolved_device)
     model.eval()
diff --git a/clams/develop/templates/app/app.py.template b/clams/develop/templates/app/app.py.template
index 35a5a95..d2eb9b0 100644
--- a/clams/develop/templates/app/app.py.template
+++ b/clams/develop/templates/app/app.py.template
@@ -38,28 +38,33 @@ from lapps.discriminators import Uri
 #                         Implement: ``_annotate()`` + ``generate()``.
 #                         Import:
 #                             from clams import ClamsPromptableApp
-#                         Also: uncomment ``inject_promptable_parameters``
-#                         block in ``metadata.py``.
+#                         Also in ``metadata.py``: uncomment the
+#                         ``inject_promptable_parameters`` block.
 #
 #   ClamsHFPromptableApp  for prompt-driven apps wrapping a local HuggingFace
 #                         ``transformers`` model (the typical VLM/LLM case).
-#                         Implement: ``_annotate()`` + declare class
-#                         attributes:
-#                             MODEL_ID = "<hf-model-id>"
+#                         Implement: ``_annotate()`` (call
+#                         ``self.load_model(parameters['model'])`` first) +
+#                         declare class attributes:
 #                             MODEL_CLS = <transformers.AutoModelFor...>
 #                             DTYPE = torch.bfloat16        # optional
 #                             PADDING_SIDE = 'left'          # optional
 #                         Import:
 #                             from clams.app import ClamsHFPromptableApp
-#                         Also: uncomment ``inject_promptable_parameters``
-#                         block in ``metadata.py``. Requires the ``[hf]``
-#                         extra: ``pip install clams-python[hf]``.
-#                         The base class provides ``__init__`` (loads the
-#                         model), ``generate()`` (HF batched inference),
-#                         ``build_conversation`` (chat-template message
-#                         list), and ``build_gen_kwargs`` (HF
-#                         ``model.generate()`` kwargs); override the latter
-#                         two only for model-specific quirks.
+#                         Also in ``metadata.py``: set
+#                         ``analyzer_versions={<hf-id>: <commit-hash>, ...}``
+#                         on the ``AppMetadata`` call, and uncomment the
+#                         ``ClamsHFPromptableApp.inject_promptable_parameters``
+#                         block (the HF override of the plain helper).
+#                         Requires the ``[hf]`` extra:
+#                             pip install clams-python[hf]
+#                         Singleton ``analyzer_versions`` families pre-load
+#                         in ``__init__`` (warm start); multi-member
+#                         families load on the first ``load_model`` call
+#                         and cache thereafter. ``generate()``,
+#                         ``build_conversation``, and ``build_gen_kwargs``
+#                         have working defaults; override only for
+#                         model-specific quirks.
 #
 # See https://clams.ai/clams-python/app-baseclasses.html for the full
 # developer guide.
diff --git a/clams/develop/templates/app/metadata.py.template b/clams/develop/templates/app/metadata.py.template
index f9596fd..d34303a 100644
--- a/clams/develop/templates/app/metadata.py.template
+++ b/clams/develop/templates/app/metadata.py.template
@@ -53,19 +53,29 @@ def appmetadata() -> AppMetadata:
     # metadta.add_parameter(more...)
 
     # If your app subclasses ``ClamsPromptableApp`` (a prompt-driven LLM/VLM/audio-LM
-    # app), uncomment the following two lines to add the SDK-managed promptable
-    # parameters (prompt, systemPrompt, temperature, maxNewTokens, etc.) to your
-    # app's metadata. See
+    # app on a non-HF backend), uncomment the following two lines to add the
+    # SDK-managed promptable parameters (prompt, systemPrompt, temperature,
+    # maxNewTokens, etc.) to your app's metadata. See
     # https://clams.ai/clams-python/app-baseclasses.html#promptable for the
-    # developer guide. Reminder: these parameter names are reserved by the SDK
-    # — do not redeclare any of them above.
+    # developer guide. Reminder: these parameter names are reserved by the SDK;
+    # do not redeclare any of them above.
     # from clams.app import ClamsPromptableApp
     # ClamsPromptableApp.inject_promptable_parameters(metadata)
     #
+    # If your app subclasses ``ClamsHFPromptableApp`` (HF transformers backend),
+    # use the HF override of the same helper -- it injects the promptable
+    # parameters AND a ``model`` parameter derived from ``analyzer_versions``.
+    # Also set ``analyzer_versions={<hf-id>: <commit-hash>, ...}`` on the
+    # ``AppMetadata(...)`` call above (replaces the singular
+    # ``analyzer_version`` for HF apps). See
+    # https://clams.ai/clams-python/app-baseclasses.html#hf-promptable for details.
+    # from clams.app import ClamsHFPromptableApp
+    # ClamsHFPromptableApp.inject_promptable_parameters(metadata)
+    #
     # To customize the default value of any promptable parameter (e.g. provide an
-    # app-specific default ``prompt``, raise ``maxNewTokens``, pin ``batchSize``,
+    # app-specific default ``prompt``, raise ``maxNewTokens``, pin ``parallelPrompts``,
     # etc.), mutate the ``default`` field on the already-injected parameter
-    # object — the SDK does NOT allow re-declaring promptable param names. See
+    # object; the SDK does NOT allow re-declaring promptable param names. See
     # https://clams.ai/clams-python/app-baseclasses.html#promptable-customizing-defaults
     # for details. Example:
     # for p in metadata.parameters:
diff --git a/documentation/app-baseclasses.rst b/documentation/app-baseclasses.rst
index b5f8cdf..8e3a204 100644
--- a/documentation/app-baseclasses.rst
+++ b/documentation/app-baseclasses.rst
@@ -127,15 +127,10 @@ discriminative model), keep using :class:`~clams.app.ClamsApp` directly.
 
 .. note::
 
-   ``ClamsPromptableApp`` assumes an **instruction-tuned or chat-tuned**
-   model: one that has been fine-tuned to follow natural-language
-   instructions and that understands a system/user/assistant role
-   structure. The parameter
-   surface (``systemPrompt``, ``promptMode``'s turn-taking semantics, the
-   chat-template message list produced by ``build_conversation``) presupposes
-   this. Bare completion / next-token-prediction base models that have not
-   been instruction-tuned do not fit this base class cleanly; for those, use
-   :class:`~clams.app.ClamsApp` directly and design your own parameter surface.
+   ``ClamsPromptableApp`` assumes an **instruction- or chat-tuned**
+   model with a system/user/assistant role structure. Bare completion
+   / next-token-prediction base models do not fit this base class
+   cleanly; use :class:`~clams.app.ClamsApp` directly for those.
 
 Standardized runtime parameters
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -200,18 +195,13 @@ from :class:`~clams.app.ClamsApp`. These names are reserved; see
      - integer
      - ``1``
      - no
-     - Number of independent prompts the app runs in parallel (stacks
-       into a single forward pass). The *size* of each prompt (how many
-       images, how long the system/user text is, etc.) is NOT regulated
-       by this parameter; that is each app's responsibility. Prompt
-       count and per-prompt content size combine multiplicatively for
-       GPU memory, so the two can blow up together. Catastrophic
-       example: ``tfSamplingMode=all`` on a TimeFrame without
-       ``targets`` expands that TF into one image per native-FPS frame
-       (300 images for a 10-second TF at 30fps); ``parallelPrompts=4``
-       then runs 4 such prompts in one forward pass (~1200 images),
-       guaranteed OOM. Keep at ``1`` on memory-tight setups; raise only
-       when per-prompt content is small and bounded.
+     - Number of independent prompts the app stacks into a single
+       forward pass. Per-prompt content size is the app's
+       responsibility; prompt count and per-prompt size combine
+       multiplicatively for GPU memory. Keep at ``1`` on memory-tight
+       setups; see the parameter's own description in
+       :py:attr:`~clams.app.ClamsPromptableApp.promptable_parameters`
+       for an OOM-risk example.
 
 .. _promptable-customizing-defaults:
 
@@ -255,39 +245,54 @@ A promptable app requires two paired edits relative to the scaffold generated
 by ``clams develop``:
 
 1. In ``app.py``, change the app class's base from :class:`~clams.app.ClamsApp`
-   to :class:`~clams.app.ClamsPromptableApp` and implement
-   :meth:`~clams.app.ClamsPromptableApp.generate`. The scaffold file already
-   contains a guiding comment at the class declaration line.
-2. In ``metadata.py``, call
-   :meth:`~clams.app.ClamsPromptableApp.inject_promptable_parameters` at the
-   end of ``appmetadata()``. The scaffold file already contains a
-   commented-out helper-call block; uncomment it.
-
-The ``__main__`` block in ``metadata.py`` does NOT change; it stays identical
-to non-promptable apps.
-
-The helper call inside ``appmetadata()`` makes the promptable parameters
-visible to both ``python metadata.py`` (build-time discovery) and to
-:meth:`~clams.app.ClamsApp._load_appmetadata` (runtime). The base class
-change ensures the app inherits the parameter-presence validation, the
-abstract ``generate()`` contract, and the helper methods at runtime.
+   to one of the promptable base classes. For a non-HF backend (remote API,
+   custom local server, etc.), use :class:`~clams.app.ClamsPromptableApp` and
+   implement :meth:`~clams.app.ClamsPromptableApp.generate`. For a local
+   HuggingFace ``transformers`` model, use
+   :class:`~clams.app.ClamsHFPromptableApp` instead and declare the model via
+   class attributes (no ``generate()`` override needed); see
+   :ref:`hf-promptable` for details. The scaffold file already contains a
+   guiding comment at the class declaration line.
+2. In ``metadata.py``, call ``inject_promptable_parameters`` at the
+   end of ``appmetadata()``. For a plain
+   :class:`~clams.app.ClamsPromptableApp` subclass, call
+   :meth:`ClamsPromptableApp.inject_promptable_parameters
+   <clams.app.ClamsPromptableApp.inject_promptable_parameters>`.
+   For a :class:`~clams.app.ClamsHFPromptableApp` subclass, set
+   ``analyzer_versions={<hf-id>: <commit-hash>, ...}`` on the
+   ``AppMetadata`` constructor call and call
+   :meth:`ClamsHFPromptableApp.inject_promptable_parameters
+   <clams.app.ClamsHFPromptableApp.inject_promptable_parameters>`
+   (the HF override that adds the ``model`` parameter on top of
+   the plain set). The scaffold ``metadata.py`` contains
+   commented-out blocks for both variants; uncomment the one
+   matching the base class chosen in step 1.
+
+The ``__main__`` block in ``metadata.py`` is unchanged from non-promptable
+apps. The helper call inside ``appmetadata()`` makes the promptable
+parameters visible to both ``python metadata.py`` (build-time discovery)
+and to :meth:`~clams.app.ClamsApp._load_appmetadata` (runtime). The base
+class change ensures the app inherits the parameter-presence validation,
+the ``generate()`` contract, and the helper methods at runtime.
+
+For a minimal worked HF example, see the class docstring on
+:class:`~clams.app.ClamsHFPromptableApp`.
 
 The ``generate()`` contract
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Subclasses MUST implement :meth:`~clams.app.ClamsPromptableApp.generate`.
-See the method's docstring for the full signature and parameter semantics.
+Subclasses of :class:`~clams.app.ClamsPromptableApp` that wrap a backend
+without a default SDK implementation (e.g., remote-API or custom local
+backends) MUST implement :meth:`~clams.app.ClamsPromptableApp.generate`.
+Subclasses of :class:`~clams.app.ClamsHFPromptableApp` inherit a concrete
+``generate()`` and do not need to override it. See the method's docstring
+for the full signature, batch semantics, and return value.
 
-The return value is a flat ``List[str]`` with one entry per prompt in the
-batch: the outer length of ``images`` (and/or ``audios``) determines N;
-``generate()`` returns ``N`` strings. For text-only single-shot calls
-(both ``images`` and ``audios`` ``None``), the return is a singleton list.
-Keep inference logic inside ``generate()`` distinct from MMIF I/O; the latter
-belongs in ``_annotate()`` (which calls ``self.generate()``).
-
-This separation is intentional: future SDK releases may provide default
-implementations of ``generate()`` for common backends, at which point apps
-that kept inference and annotation creation separate will need no changes.
+Keep inference logic inside ``generate()`` distinct from MMIF I/O; the
+latter belongs in ``_annotate()`` (which calls ``self.generate()``).
+This separation lets HF-backed apps inherit the default ``generate()``
+without restating backend mechanics, and lets non-HF apps swap in a new
+``generate()`` without rewriting their MMIF I/O.
 
 .. _promptable-multiturn:
 
@@ -371,61 +376,17 @@ HuggingFace transformers (``clams.backends.hf``)
 
 :func:`clams.backends.hf.load_hf_model` loads any local HuggingFace
 ``transformers`` model via ``from_pretrained()`` and returns it ready
-for inference. It encapsulates the device, processor/tokenizer, and
-inference-mode boilerplate that every HF-backed app needs to do
-identically:
-
-- detects an available CUDA device and falls back to CPU when none is
-  present
-- loads the caller-supplied ``processor_cls`` (defaults to
-  :class:`~transformers.AutoProcessor`; pass
-  :class:`~transformers.AutoTokenizer`,
-  :class:`~transformers.AutoImageProcessor`, etc. for narrower or
-  more specific cases)
-- loads the model via the caller-supplied ``model_cls``
-- moves the model to the resolved device and switches it to ``eval()``
-  mode
-- when ``padding_side`` is given (decoder-only / batched-generation
-  case), configures the tokenizer's padding side and uses the EOS
-  token as the pad token; left as the model's own default otherwise
-
-The function signature is::
-
-    load_hf_model(
-        model_id: str,
-        model_cls,                              # e.g. AutoModelForCausalLM, AutoModelForImageTextToText, ConvNextV2Model, ViTModel, ...
-        processor_cls = None,                   # default AutoProcessor; pass AutoTokenizer / AutoImageProcessor / ... for narrower cases, or None to skip processor loading
-        dtype = None,                           # None leaves the model's own default (typically float32); set explicitly (e.g., torch.bfloat16) for LLMs
-        device: Optional[str] = None,           # auto-detected when None
-        padding_side: Optional[str] = None,     # set to 'left' for decoder-only batched generation; leave None for encoder / non-batched cases
-        model_kwargs: Optional[dict] = None,    # extra kwargs forwarded to model_cls.from_pretrained()
-        processor_kwargs: Optional[dict] = None,  # extra kwargs forwarded to processor_cls.from_pretrained()
-    ) -> Tuple[processor, model, device_str]
-
-The ``model_kwargs`` and ``processor_kwargs`` pass-throughs cover the
-common ``from_pretrained()`` options that vary between model classes
-and use cases: ``use_safetensors``, ``use_fast``,
-``add_pooling_layer``, ``trust_remote_code``, ``revision``, etc.
-
-An app's ``__init__`` typically calls this helper once and stores the
-returned ``processor`` (or ``tokenizer`` / ``image_processor``),
-``model``, and ``device`` on ``self`` for use inside its inference
-method (e.g., :meth:`~clams.app.ClamsPromptableApp.generate`). See the
-function's docstring for the full parameter reference and return
-value.
-
-Promptable apps wrapping a decoder-only / chat-tuned model typically
-pass ``padding_side='left'`` and an explicit dtype like
-``torch.bfloat16``; encoder-side HF apps (e.g., a vision feature
-extractor + classifier head) leave both at the defaults and pass any
-class-specific kwargs through ``model_kwargs`` /
-``processor_kwargs``.
-
-For promptable apps specifically, the
-:class:`~clams.app.ClamsHFPromptableApp` base class (see
-:ref:`hf-promptable`) wraps this helper plus the standard inference
-loop, so most HF-backed VLM/LLM apps don't need to call
-:func:`load_hf_model` directly.
+for inference, encapsulating the device, processor/tokenizer, dtype,
+and inference-mode boilerplate that every HF-backed app does
+identically. An app's ``__init__`` typically calls it once and stores
+the returned ``(processor, model, device)`` triple on ``self`` for
+later inference. See :func:`~clams.backends.hf.load_hf_model` for the
+full parameter reference, defaults, and pass-through kwargs.
+
+For promptable apps specifically,
+:class:`~clams.app.ClamsHFPromptableApp` (see :ref:`hf-promptable`)
+wraps this helper plus the standard inference loop, so most HF-backed
+VLM/LLM apps do not call :func:`load_hf_model` directly.
 
 Installation
 ~~~~~~~~~~~~
@@ -478,10 +439,10 @@ non-HF local backend, inherit from
 Class-attribute hooks
 ^^^^^^^^^^^^^^^^^^^^^
 
-Concrete subclasses declare the model declaratively via class
-attributes; the base ``__init__`` reads them, calls
-:func:`load_hf_model`, and stores ``self.processor``, ``self.model``,
-``self.device``:
+Concrete subclasses declare the model class plus optional dtype /
+padding hints via class attributes, and declare the family of
+supported model variants (with pinned commits) via
+``analyzer_versions`` in ``metadata.py``:
 
 .. list-table::
    :header-rows: 1
@@ -490,9 +451,6 @@ attributes; the base ``__init__`` reads them, calls
    * - Attribute
      - Meaning
      - Required
-   * - ``MODEL_ID``
-     - HuggingFace model identifier (Hub repo name or local path).
-     - yes
    * - ``MODEL_CLS``
      - ``transformers`` model class (e.g.
        :class:`~transformers.AutoModelForImageTextToText`,
@@ -517,54 +475,61 @@ attributes; the base ``__init__`` reads them, calls
        ``trust_remote_code=True``).
      - no
 
+The HF model identifiers themselves are NOT a class attribute. They
+live in ``metadata.py`` as ``analyzer_versions``, a
+``Dict[str, str]`` mapping each supported model id to its pinned
+commit hash. The SDK auto-derives a ``model`` runtime parameter
+from this dict, with ``choices`` set to the dict keys.
+
+Family / singleton handling
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When ``analyzer_versions`` contains a single entry (the typical
+single-model app), the SDK eagerly pre-loads that one model in
+``__init__`` and sets ``model.default`` to the only key so callers
+can omit the parameter. Single-model apps thus preserve warm-start
+semantics: the model is loaded at app startup, not on first request.
+
+When ``analyzer_versions`` contains multiple entries (a family app),
+loading is deferred until the first
+:py:meth:`~clams.app.ClamsHFPromptableApp.load_model` call inside
+``_annotate``, and ``model`` is injected without a default: callers must
+pick a family member explicitly, unless the developer sets
+``model.default`` post-injection to provide a recommended pick. Loaded
+models are cached per ``(model_id, revision)`` for the lifetime of the app
+instance; switching models loads on first miss and hits the cache on repeat.
+
+Reproducibility: ``model`` refinement and view metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The user-facing ``model`` parameter accepts raw HF model ids
+(``org/repo-name``). The SDK's
+:py:meth:`~clams.app.ClamsHFPromptableApp._refine_params` expands the
+raw value to ``org/repo-name@<revision>`` form (looking up the pinned
+commit in ``analyzer_versions``) during parameter refinement. The
+standard ``sign_view`` flow then stamps:
+
+- the **raw** user choice into ``view.metadata.parameters['model']``
+  (transparency: what the user typed),
+- the **resolved** ``org/repo-name@<revision>`` into
+  ``view.metadata.appConfiguration['model']`` (reproducibility: the
+  exact commit applied).
+
+A consumer of the output MMIF can read the resolved revision directly
+from the view metadata, with no cross-reference to the app metadata
+required.
+
 What the base class provides
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-- A default :py:meth:`~clams.app.ClamsHFPromptableApp.__init__` that
-  loads the model from the class attributes via
-  :func:`load_hf_model`.
-- A concrete :py:meth:`~clams.app.ClamsHFPromptableApp.generate` that
-  satisfies the :class:`~clams.app.ClamsPromptableApp` abstract
-  contract. Takes ``images`` / ``audios`` as ``List[List[Any]]``
-  (N groups, one per prompt) and runs all N prompts in one HF
-  forward pass; returns one decoded string per group. Apps call
-  this from ``_annotate`` to run their inference; per-image
-  broadcast is a singleton-wrap (``images=[[img] for img in
-  images]``), per-TF composite is one group of N images per TF.
-- A default
-  :py:meth:`~clams.app.ClamsHFPromptableApp.build_gen_kwargs` that
-  maps SDK promptable parameters (``maxNewTokens``, ``temperature``,
-  ``topP``, ``topK``) into HF ``model.generate()`` kwargs.
-  Subclasses may override to add model-specific kwargs
-  (``num_beams``, ``repetition_penalty``, custom stopping criteria,
-  etc.).
-
-Minimal subclass example
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code-block:: python
-
-    from transformers import AutoModelForImageTextToText
-    import torch
-
-    from clams.app import ClamsHFPromptableApp
-
-
-    class MyVLMCaptioner(ClamsHFPromptableApp):
-        MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
-        MODEL_CLS = AutoModelForImageTextToText
-        DTYPE = torch.bfloat16
-        PADDING_SIDE = 'left'
-
-        def _appmetadata(self):
-            pass  # defined in metadata.py
-
-        def _annotate(self, mmif, **parameters):
-            ...  # collect tasks from MMIF, build image groups, call
-                 # self.generate(prompt, images=image_groups, ...), then
-                 # store responses via self.response_to_grounded_textdocument
-
-The ``metadata.py`` for an :class:`~clams.app.ClamsHFPromptableApp`
-subclass is identical to a plain
-:class:`~clams.app.ClamsPromptableApp` -- the helper-call requirement
-and the parameter table are unchanged.
+A subclass typically only writes ``_annotate()``. The base class
+supplies model loading and caching
+(:py:meth:`~clams.app.ClamsHFPromptableApp.load_model`), the parameter
+injector (:py:meth:`ClamsHFPromptableApp.inject_promptable_parameters
+<clams.app.ClamsHFPromptableApp.inject_promptable_parameters>`),
+a concrete batched HF :py:meth:`~clams.app.ClamsHFPromptableApp.generate`,
+and a default
+:py:meth:`~clams.app.ClamsHFPromptableApp.build_gen_kwargs` mapping
+the SDK promptable parameters to HF ``model.generate()`` kwargs. See
+each method's docstring for full details.
+
diff --git a/documentation/introduction.rst b/documentation/introduction.rst
index f2d420f..96435c5 100644
--- a/documentation/introduction.rst
+++ b/documentation/introduction.rst
@@ -72,7 +72,7 @@ As a developer you can expose different behaviors of the ``annotate()`` method b
   These runtime configurations are not part of the MMIF input, but for reproducible analysis, you should record these configurations in the output MMIF. 
 
 .. note::
-  Some runtime parameters are managed by the SDK itself rather than declared per-app. The *universal* parameters in :const:`clams.app.ClamsApp.universal_parameters` are one such set — they are auto-added to every CLAMS app. Specialized base classes (see below) add their own SDK-managed parameter sets on top.
+  Some runtime parameters are managed by the SDK itself rather than declared per-app. The *universal* parameters in :const:`clams.app.ClamsApp.universal_parameters` are one such set; they are auto-added to every CLAMS app. Specialized base classes (see below) add their own SDK-managed parameter sets on top.
 
 .. warning::
   All the runtime configurations should be pre-announced in the app metadata.
diff --git a/documentation/runtime-params.rst b/documentation/runtime-params.rst
index cb8d65a..146102e 100644
--- a/documentation/runtime-params.rst
+++ b/documentation/runtime-params.rst
@@ -195,18 +195,14 @@ expected format in the parameter's ``description`` field.
 Promptable apps: an extra SDK-managed parameter set
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-For apps that wrap an **instruction- or chat-tuned** promptable model — an
-LLM or other multimodal model, local or remote — we recommend inheriting
-from :class:`~clams.app.ClamsPromptableApp` instead of
-:class:`~clams.app.ClamsApp`. The promptable base class adds a standardized,
-SDK-managed set of runtime parameters (``prompt``, ``systemPrompt``,
-``temperature``, ``maxNewTokens``, ``topP``, ``topK``, ``promptMode``,
-``parallelPrompts``) on top of the universal parameters. If you use this base
-class, these names are reserved — your app's ``metadata.py`` must not
-redeclare them — and are added via a single helper call inside
-``appmetadata()``.
-
-See :ref:`promptable` for the full developer guide.
+For apps that wrap an **instruction- or chat-tuned** promptable model
+(an LLM or other multimodal model, local or remote), inherit from
+:class:`~clams.app.ClamsPromptableApp` instead of
+:class:`~clams.app.ClamsApp`. The promptable base class adds a
+standardized SDK-managed parameter set on top of the universal
+parameters; these parameter names are reserved (do not redeclare them)
+and are added via a single helper call inside ``appmetadata()``. See
+:ref:`promptable` for the full developer guide and parameter list.
 
 .. _runtime-params-envelope-note:
 
diff --git a/documentation/tutorial.md b/documentation/tutorial.md
index 58ae746..e5d2698 100644
--- a/documentation/tutorial.md
+++ b/documentation/tutorial.md
@@ -228,20 +228,20 @@ First, with `text_value` we get the text from the text document, either from its
 
 ## Working with TimeFrame Annotations
 
-Many CLAMS apps process video by operating on TimeFrame annotations produced by an upstream app (e.g., scene detection, shot segmentation). A TimeFrame can carry structural members (currently called `targets` — a list of TimePoint IDs covering every frame in the segment), a salient subset of those members (currently called `representatives`), or simply `start`/`end` boundaries.
+Many CLAMS apps process video by operating on TimeFrame annotations produced by an upstream app (e.g., scene detection, shot segmentation). A TimeFrame can carry structural members (currently called `targets`, a list of TimePoint IDs covering every frame in the segment), a salient subset of those members (currently called `representatives`), or simply `start`/`end` boundaries.
 
 > **Note**
 > The property names `targets` and `representatives` are under review and may be renamed in a future MMIF spec version. See [mmif#238](https://github.com/clamsproject/mmif/issues/238) for the ongoing discussion. The SDK API will be updated accordingly.
 
 ### Frame sampling with `tfSamplingMode`
 
-When your app receives TimeFrame annotations, the caller can control which frames your app processes by setting the `tfSamplingMode` runtime parameter. This is a **universal parameter** — automatically available on every CLAMS app without any per-app configuration.
+When your app receives TimeFrame annotations, the caller can control which frames your app processes by setting the `tfSamplingMode` runtime parameter. This is a **universal parameter**: automatically available on every CLAMS app without any per-app configuration.
 
 There are three modes:
 
-- `representatives` (default) — use the frames listed in the TimeFrame's `representatives` property. If no representatives exist, the TimeFrame is skipped.
-- `single` — pick one frame: the middle representative if available, otherwise the midpoint of the start/end interval.
-- `all` — use every frame in `targets` if present, otherwise generate every frame in the start/end interval.
+- `representatives` (default): use the frames listed in the TimeFrame's `representatives` property. If no representatives exist, the TimeFrame is skipped.
+- `single`: pick one frame, namely the middle representative if available, otherwise the midpoint of the start/end interval.
+- `all`: use every frame in `targets` if present, otherwise generate every frame in the start/end interval.
 
 App developers do **not** need to handle this parameter themselves. The SDK intercepts it in `annotate()` and sets a context variable before `_annotate()` runs. Inside `_annotate()`, calls to `vdh.extract_frames_by_mode()` automatically read the active mode and select frames accordingly. The underlying per-mode functions (`_sample_representatives()`, `_sample_single()`, `_sample_all()`) in `mmif.utils.video_document_helper` are also available for apps that need frame numbers without extracting images.
 
diff --git a/tests/test_promptable.py b/tests/test_promptable.py
index 5426c14..44a8fe7 100644
--- a/tests/test_promptable.py
+++ b/tests/test_promptable.py
@@ -19,30 +19,45 @@
 # Test infrastructure
 # ---------------------------------------------------------------------------
 
-def make_metadata(call_helper=True, pre_declare=None):
+def make_metadata(call_helper=True, pre_declare=None,
+                  analyzer_versions=None, hf_helper=False):
     """
     Build a fresh AppMetadata for tests.
 
     :param call_helper: if True, calls
         ``ClamsPromptableApp.inject_promptable_parameters(metadata)``
         at the end (simulating a correctly-written ``appmetadata()``).
+        Mutually exclusive with ``hf_helper``.
     :param pre_declare: if set to a parameter spec dict, calls
         ``metadata.add_parameter(**pre_declare)`` BEFORE the helper
         runs — used to test reservation enforcement.
+    :param analyzer_versions: if set, passed through to
+        ``AppMetadata(analyzer_versions=...)``. Required when the
+        fixture is consumed by ``ClamsHFPromptableApp`` tests.
+    :param hf_helper: if True, calls
+        ``ClamsHFPromptableApp.inject_promptable_parameters(metadata)``
+        (the HF override of the plain promptable helper). Use for HF
+        fixture builds.
     """
-    m = AppMetadata(
+    kwargs = dict(
         name="Example Promptable App",
         description="Test fixture, creating input TD - output TD alignment",
         app_license="MIT",
         identifier="https://apps.clams.ai/example-promptable/v1",
         url="https://fakegithub.com/some/repository",
     )
+    if analyzer_versions is not None:
+        kwargs['analyzer_versions'] = analyzer_versions
+    m = AppMetadata(**kwargs)
     m.add_input(DocumentTypes.TextDocument)
     m.add_output(DocumentTypes.TextDocument)
     m.add_output(AnnotationTypes.Alignment)
     if pre_declare is not None:
         m.add_parameter(**pre_declare)
-    if call_helper:
+    if hf_helper:
+        from clams.app import ClamsHFPromptableApp
+        ClamsHFPromptableApp.inject_promptable_parameters(m)
+    elif call_helper:
         ClamsPromptableApp.inject_promptable_parameters(m)
     return m
 
@@ -398,66 +413,172 @@ class TestHFPromptableAppClassAttrs(unittest.TestCase):
     End-to-end inference tests live separately.
     """
 
-    def _make_subclass(self, *, model_id=None, model_cls=None, **extra_attrs):
+    SINGLETON_AV = {'org/fake-model': 'deadbee'}
+    MULTI_AV = {
+        'org/large-model': 'aaaaaaa',
+        'org/small-model': 'bbbbbbb',
+    }
+
+    def _make_subclass(
+            self, *, model_cls=object,
+            analyzer_versions=None, **extra_attrs):
+        if analyzer_versions is None:
+            analyzer_versions = dict(self.SINGLETON_AV)
         attrs = {
-            '_load_appmetadata': lambda self: make_metadata(call_helper=True),
+            '_load_appmetadata': lambda self: make_metadata(
+                hf_helper=True,
+                analyzer_versions=dict(analyzer_versions),
+            ),
             '_appmetadata': lambda self: None,
             '_annotate': lambda self, mmif, **kw: mmif,
-            'MODEL_ID': model_id,
             'MODEL_CLS': model_cls,
         }
         attrs.update(extra_attrs)
         from clams.app import ClamsHFPromptableApp
         return type('TestHFApp', (ClamsHFPromptableApp,), attrs)
 
-    def test_missing_model_id_raises(self):
-        cls = self._make_subclass(model_id=None, model_cls=object)
+    def test_missing_model_cls_raises(self):
+        cls = self._make_subclass(model_cls=None)
         with self.assertRaises(ValueError) as ctx:
             cls()
-        self.assertIn('MODEL_ID', str(ctx.exception))
+        self.assertIn('MODEL_CLS', str(ctx.exception))
 
-    def test_missing_model_cls_raises(self):
-        cls = self._make_subclass(model_id='fake-id', model_cls=None)
+    def test_missing_analyzer_versions_raises(self):
+        # Use the plain promptable helper so promptable params are
+        # injected (parent __init__ passes) but analyzer_versions is
+        # absent and ``model`` was never injected. HF __init__ should
+        # refuse on the analyzer_versions check.
+        from clams.app import ClamsHFPromptableApp
+        cls = type('TestHFAppBad', (ClamsHFPromptableApp,), {
+            '_load_appmetadata': lambda self: make_metadata(
+                call_helper=True),  # plain promptable, no analyzer_versions
+            '_appmetadata': lambda self: None,
+            '_annotate': lambda self, mmif, **kw: mmif,
+            'MODEL_CLS': object,
+        })
         with self.assertRaises(ValueError) as ctx:
             cls()
-        self.assertIn('MODEL_CLS', str(ctx.exception))
+        self.assertIn('analyzer_versions', str(ctx.exception))
 
-    def test_loads_via_load_hf_model_with_class_attrs(self):
+    def _patch_load(self):
         """
-        Patches ``clams.backends.hf.load_hf_model`` and verifies the
-        base ``__init__`` forwards the declared class attributes to it.
+        Context-manager-ish helper that swaps in a fake ``load_hf_model``
+        recording every call. Returns ``(restore_fn, calls_list)``.
         """
         import clams.backends.hf as hf_module
         original = hf_module.load_hf_model
-        captured = {}
+        calls = []
 
         def fake_load(model_id, model_cls, **kwargs):
-            captured['model_id'] = model_id
-            captured['model_cls'] = model_cls
-            captured.update(kwargs)
-            return ('FAKE_PROCESSOR', 'FAKE_MODEL', 'cpu')
+            calls.append({'model_id': model_id, 'model_cls': model_cls, **kwargs})
+            # processor / model / device tuple uniquely identifiable
+            return (f'PROC:{model_id}@{kwargs.get("revision")}',
+                    f'MODEL:{model_id}@{kwargs.get("revision")}',
+                    'cpu')
 
+        hf_module.load_hf_model = fake_load
+        return (lambda: setattr(hf_module, 'load_hf_model', original)), calls
+
+    def test_singleton_eagerly_preloads_in_init(self):
+        restore, calls = self._patch_load()
         try:
-            hf_module.load_hf_model = fake_load
             cls = self._make_subclass(
-                model_id='org/fake-model',
-                model_cls=object,
+                analyzer_versions=self.SINGLETON_AV,
                 DTYPE='FAKE_DTYPE',
                 PADDING_SIDE='left',
                 MODEL_KWARGS={'trust_remote_code': True},
             )
             app = cls()
-            self.assertEqual(app.processor, 'FAKE_PROCESSOR')
-            self.assertEqual(app.model, 'FAKE_MODEL')
-            self.assertEqual(app.device, 'cpu')
-            self.assertEqual(captured['model_id'], 'org/fake-model')
-            self.assertIs(captured['model_cls'], object)
-            self.assertEqual(captured['dtype'], 'FAKE_DTYPE')
-            self.assertEqual(captured['padding_side'], 'left')
+            # eager load on the single family member
+            self.assertEqual(len(calls), 1)
+            self.assertEqual(calls[0]['model_id'], 'org/fake-model')
+            self.assertEqual(calls[0]['revision'], 'deadbee')
+            self.assertEqual(calls[0]['dtype'], 'FAKE_DTYPE')
+            self.assertEqual(calls[0]['padding_side'], 'left')
             self.assertEqual(
-                captured['model_kwargs'], {'trust_remote_code': True})
+                calls[0]['model_kwargs'], {'trust_remote_code': True})
+            # self.processor / self.model / self.device populated
+            self.assertEqual(app.processor, 'PROC:org/fake-model@deadbee')
+            self.assertEqual(app.model, 'MODEL:org/fake-model@deadbee')
+            self.assertEqual(app.device, 'cpu')
+        finally:
+            restore()
+
+    def test_multimember_defers_loading(self):
+        restore, calls = self._patch_load()
+        try:
+            cls = self._make_subclass(analyzer_versions=self.MULTI_AV)
+            app = cls()
+            # no eager load for multi-member families
+            self.assertEqual(calls, [])
+            self.assertIsNone(app.processor)
+            self.assertIsNone(app.model)
+            self.assertIsNone(app.device)
+        finally:
+            restore()
+
+    def test_load_model_parses_at_revision_form_and_caches(self):
+        restore, calls = self._patch_load()
+        try:
+            cls = self._make_subclass(analyzer_versions=self.MULTI_AV)
+            app = cls()
+            # first call -- load via load_hf_model
+            app.load_model('org/large-model@aaaaaaa')
+            self.assertEqual(len(calls), 1)
+            self.assertEqual(calls[0]['model_id'], 'org/large-model')
+            self.assertEqual(calls[0]['revision'], 'aaaaaaa')
+            self.assertEqual(app.processor, 'PROC:org/large-model@aaaaaaa')
+            # second call same model -- cache hit, no new load
+            app.load_model('org/large-model@aaaaaaa')
+            self.assertEqual(len(calls), 1)
+            # switch to other family member -- new load
+            app.load_model('org/small-model@bbbbbbb')
+            self.assertEqual(len(calls), 2)
+            self.assertEqual(calls[1]['model_id'], 'org/small-model')
+            self.assertEqual(calls[1]['revision'], 'bbbbbbb')
+            self.assertEqual(app.processor, 'PROC:org/small-model@bbbbbbb')
+            # back to first -- still cached
+            app.load_model('org/large-model@aaaaaaa')
+            self.assertEqual(len(calls), 2)
+            self.assertEqual(app.processor, 'PROC:org/large-model@aaaaaaa')
+        finally:
+            restore()
+
+    def test_load_model_accepts_raw_form_looks_up_revision(self):
+        restore, calls = self._patch_load()
+        try:
+            cls = self._make_subclass(analyzer_versions=self.MULTI_AV)
+            app = cls()
+            app.load_model('org/small-model')  # no @rev suffix
+            self.assertEqual(calls[0]['model_id'], 'org/small-model')
+            self.assertEqual(calls[0]['revision'], 'bbbbbbb')
+        finally:
+            restore()
+
+    def test_refine_params_expands_modelid_to_at_revision(self):
+        restore, _ = self._patch_load()
+        try:
+            cls = self._make_subclass(analyzer_versions=self.MULTI_AV)
+            app = cls()
+            refined = app._refine_params(
+                prompt=['hi'],
+                model=['org/large-model'],
+            )
+            self.assertEqual(refined['model'], 'org/large-model@aaaaaaa')
+        finally:
+            restore()
+
+    def test_singleton_default_lets_user_omit_modelid(self):
+        restore, _ = self._patch_load()
+        try:
+            cls = self._make_subclass(analyzer_versions=self.SINGLETON_AV)
+            app = cls()
+            # No model in input -- SDK fills in the singleton default,
+            # then our override expands it.
+            refined = app._refine_params(prompt=['hi'])
+            self.assertEqual(refined['model'], 'org/fake-model@deadbee')
         finally:
-            hf_module.load_hf_model = original
+            restore()
 
 
 if __name__ == '__main__':

From 9c16e16edef8d9608589777676fc05c36f61c5ad Mon Sep 17 00:00:00 2001
From: Keigh Rim <keigh.rim@gmail.com>
Date: Fri, 29 May 2026 16:06:22 -0400
Subject: [PATCH 09/10] added pipeline support in the templated TF util

---
 clams/backends/hf.py              | 108 ++++++++++++++++++--
 documentation/app-baseclasses.rst |  32 ++++--
 tests/test_backends_hf.py         | 158 +++++++++++++++++++++++++++++-
 3 files changed, 275 insertions(+), 23 deletions(-)

diff --git a/clams/backends/hf.py b/clams/backends/hf.py
index a92df58..b5f9aef 100644
--- a/clams/backends/hf.py
+++ b/clams/backends/hf.py
@@ -1,11 +1,16 @@
 """
-HuggingFace transformers backend helper.
+HuggingFace transformers backend helpers.
 
-Provides :func:`load_hf_model`, a general loader that wraps the device,
-processor, dtype, and inference-mode boilerplate every HF-backed CLAMS
-app does identically. Usable for any model class that supports
-``from_pretrained()``: instruction-tuned LLMs/VLMs, encoder-only
-classifiers, vision/audio feature extractors, etc.
+Two general loaders that wrap the device / kwargs / inference-mode
+boilerplate every HF-backed CLAMS app does identically:
+
+* :func:`load_hf_model` -- ``from_pretrained()`` flow for any model
+  class (instruction-tuned LLMs/VLMs, encoder-only classifiers,
+  vision/audio feature extractors, etc.). Use when the app needs raw
+  access to the underlying model and processor.
+* :func:`load_hf_pipeline` -- task-level :func:`transformers.pipeline`
+  flow (ASR, NER, text classification, zero-shot, etc.). Use when
+  pipeline-level inference is sufficient.
 
 ``torch`` and ``transformers`` are optional dependencies. Install them
 via the ``[hf]`` extra::
@@ -14,10 +19,10 @@
 
 Imports are lazy: this module can be referenced from
 :mod:`clams.app` without triggering an ``ImportError`` on a base
-``clams-python`` install. The :class:`ImportError` only fires when
-:func:`load_hf_model` is actually called without the extras.
+``clams-python`` install. The :class:`ImportError` only fires when a
+loader is actually called without the extras.
 """
-from typing import Any, Optional, Tuple
+from typing import Any, Optional, Tuple, Union
 
 
 def load_hf_model(
@@ -140,3 +145,88 @@ class with ``from_pretrained()``. Defaults to
     model.eval()
 
     return processor, model, resolved_device
+
+
+def load_hf_pipeline(
+        task: str,
+        model_id: str,
+        device: Optional[Union[str, int]] = None,
+        revision: Optional[str] = None,
+        model_kwargs: Optional[dict] = None,
+        pipeline_kwargs: Optional[dict] = None,
+) -> Tuple[Any, Union[str, int]]:
+    """
+    Load a HuggingFace :func:`transformers.pipeline` for ``task`` and
+    return it ready for inference. Wraps the device / revision /
+    kwargs-forwarding boilerplate that every pipeline-backed CLAMS
+    app does identically. Use this for apps wrapping a task-level
+    pipeline (ASR via ``"automatic-speech-recognition"``, NER via
+    ``"token-classification"``, text classification, zero-shot, etc.);
+    use :func:`load_hf_model` instead when the app needs raw access
+    to the underlying model / processor (e.g., for custom chat-template
+    formatting or batched ``generate`` calls).
+
+    :param task: pipeline task string forwarded to
+        :func:`transformers.pipeline` (e.g.,
+        ``"automatic-speech-recognition"``, ``"token-classification"``).
+    :param model_id: HuggingFace model identifier (Hub repo name or
+        local path) forwarded to ``pipeline(model=...)``.
+    :param device: target device. Accepts the string form
+        (``'cuda'``, ``'cpu'``, ``'cuda:0'``) for parity with
+        :func:`load_hf_model`, or the integer form accepted natively
+        by ``pipeline`` (``-1`` for CPU, ``0+`` for GPU index). When
+        ``None`` (default), auto-detects cuda availability and falls
+        back to cpu (string form).
+    :param revision: optional Git revision (commit hash, branch, or
+        tag) on the Hub to pin the download to. Strongly recommended
+        for production; see :func:`load_hf_model` for rationale.
+    :param model_kwargs: extra kwargs forwarded to the underlying
+        ``model.from_pretrained()`` via the
+        ``pipeline(model_kwargs={...})`` channel.
+    :param pipeline_kwargs: extra kwargs forwarded directly to
+        :func:`transformers.pipeline` (e.g. ``generate_kwargs``,
+        ``tokenizer``, ``feature_extractor``, ``batch_size``,
+        ``framework``). ``task``, ``model``, and ``device`` here are
+        always discarded; ``revision`` and ``model_kwargs`` here are
+        overridden only when the matching helper argument is supplied.
+    :returns: ``(pipeline, device)`` tuple. ``device`` is the resolved
+        device the pipeline is on, in the form it was passed (or the
+        auto-resolved string form when ``device=None``).
+    :rtype: Tuple[Any, Union[str, int]]
+    :raises ImportError: if ``torch`` or ``transformers`` is not
+        installed. Install the ``[hf]`` extra to fix.
+    """
+    try:
+        import torch  # pytype: disable=import-error
+    except ImportError as e:
+        raise ImportError(
+            "clams.backends.hf requires the `torch` package. "
+            "Install with: pip install clams-python[hf]"
+        ) from e
+    try:
+        from transformers import pipeline  # pytype: disable=import-error
+    except ImportError as e:
+        raise ImportError(
+            "clams.backends.hf requires the `transformers` package. "
+            "Install with: pip install clams-python[hf]"
+        ) from e
+
+    resolved_device = device if device is not None else (
+        'cuda' if torch.cuda.is_available() else 'cpu')
+
+    pipeline_call_kwargs = dict(pipeline_kwargs or {})
+    # Helper-owned keys: explicit args win on collision.
+    for owned in ('task', 'model', 'device'):
+        pipeline_call_kwargs.pop(owned, None)
+    if model_kwargs:
+        pipeline_call_kwargs['model_kwargs'] = dict(model_kwargs)
+    if revision is not None:
+        pipeline_call_kwargs['revision'] = revision
+
+    pipe = pipeline(
+        task,
+        model=model_id,
+        device=resolved_device,
+        **pipeline_call_kwargs,
+    )
+    return pipe, resolved_device
diff --git a/documentation/app-baseclasses.rst b/documentation/app-baseclasses.rst
index 8e3a204..b440de0 100644
--- a/documentation/app-baseclasses.rst
+++ b/documentation/app-baseclasses.rst
@@ -374,19 +374,31 @@ when your app needs the backend.
 HuggingFace transformers (``clams.backends.hf``)
 """"""""""""""""""""""""""""""""""""""""""""""""
 
-:func:`clams.backends.hf.load_hf_model` loads any local HuggingFace
-``transformers`` model via ``from_pretrained()`` and returns it ready
-for inference, encapsulating the device, processor/tokenizer, dtype,
-and inference-mode boilerplate that every HF-backed app does
-identically. An app's ``__init__`` typically calls it once and stores
-the returned ``(processor, model, device)`` triple on ``self`` for
-later inference. See :func:`~clams.backends.hf.load_hf_model` for the
-full parameter reference, defaults, and pass-through kwargs.
+Two loaders cover the two HF-loading conventions a CLAMS app would
+pick between:
+
+:func:`clams.backends.hf.load_hf_model`
+    The ``from_pretrained()`` flow. Returns
+    ``(processor, model, device)`` ready for inference, encapsulating
+    the device, processor/tokenizer, dtype, and inference-mode
+    boilerplate. Use when the app needs raw access to the underlying
+    model and processor (e.g., for custom chat-template formatting or
+    batched ``generate`` calls).
+
+:func:`clams.backends.hf.load_hf_pipeline`
+    The :func:`transformers.pipeline` flow. Returns
+    ``(pipeline, device)`` ready for inference. Use when a task-level
+    pipeline is sufficient (ASR, NER, text classification, zero-shot,
+    etc.). Accepts the same revision pinning and shares the
+    auto-device-detection behavior with :func:`load_hf_model`.
+
+See each function's docstring for the full parameter reference,
+defaults, and pass-through kwargs.
 
 For promptable apps specifically,
 :class:`~clams.app.ClamsHFPromptableApp` (see :ref:`hf-promptable`)
-wraps this helper plus the standard inference loop, so most HF-backed
-VLM/LLM apps do not call :func:`load_hf_model` directly.
+wraps :func:`load_hf_model` plus the standard inference loop, so most
+HF-backed VLM/LLM apps do not call either loader directly.
 
 Installation
 ~~~~~~~~~~~~
diff --git a/tests/test_backends_hf.py b/tests/test_backends_hf.py
index d6ddb2e..df30df5 100644
--- a/tests/test_backends_hf.py
+++ b/tests/test_backends_hf.py
@@ -1,20 +1,35 @@
 """
-Tests for :func:`clams.backends.hf.load_hf_model`.
+Tests for :mod:`clams.backends.hf`.
 
 Exercises the device / dtype / padding-side / kwargs-passthrough
-behavior of the helper against mocked ``transformers`` model and
-processor classes.
+behavior of both :func:`load_hf_model` and :func:`load_hf_pipeline`
+against mocked ``transformers`` model, processor, and pipeline
+constructors.
 
 If ``torch`` is not installed, the whole file is skipped (it is an
 optional dep behind the ``[hf]`` extra).
 """
 import unittest
+from unittest import mock
 
 import pytest
 
 pytest.importorskip('torch')
+pytest.importorskip('transformers')
 
-from clams.backends.hf import load_hf_model  # noqa: E402
+# Force ``transformers.pipeline`` to be eagerly resolved into the
+# package's ``__dict__``. ``transformers`` uses a lazy-loading
+# ``_LazyModule`` that fetches submodule attributes via
+# ``__getattr__`` on first access; before that, the attribute does
+# not live in ``__dict__``. The first ``mock.patch('transformers.pipeline', ...)``
+# call would then silently fail to redirect ``from transformers import pipeline``
+# inside the helper. Touching the attribute here resolves it and
+# caches it in the package dict, so subsequent ``mock.patch`` calls
+# rewrite the real entry as expected.
+import transformers  # noqa: E402
+_ = transformers.pipeline
+
+from clams.backends.hf import load_hf_model, load_hf_pipeline  # noqa: E402
 
 
 # ---------------------------------------------------------------------------
@@ -244,5 +259,140 @@ def test_explicit_device_honored(self):
         self.assertEqual(model.device, 'cpu')
 
 
+# ---------------------------------------------------------------------------
+# load_hf_pipeline tests
+# ---------------------------------------------------------------------------
+
+class _FakePipeline:
+    """Captures the args/kwargs the helper forwards to
+    ``transformers.pipeline``. Behaves as the returned pipeline object
+    too -- just a tagged callable stand-in."""
+
+    last_args = None
+    last_kwargs = None
+
+    def __init__(self, *args, **kwargs):
+        type(self).last_args = args
+        type(self).last_kwargs = dict(kwargs)
+
+
+def _patch_pipeline():
+    """Patch ``transformers.pipeline`` to record its call and return a
+    ``_FakePipeline`` instance."""
+    _FakePipeline.last_args = None
+    _FakePipeline.last_kwargs = None
+    return mock.patch('transformers.pipeline', _FakePipeline)
+
+
+class TestLoadHFPipelineDefaults(unittest.TestCase):
+    """The default path: just task + model_id."""
+
+    def test_returns_pipeline_and_device(self):
+        with _patch_pipeline():
+            pipe, device = load_hf_pipeline(
+                'automatic-speech-recognition', 'openai/whisper-tiny')
+        self.assertIsInstance(pipe, _FakePipeline)
+        self.assertIn(device, ('cpu', 'cuda'))
+
+    def test_task_arrives_first_positional(self):
+        with _patch_pipeline():
+            load_hf_pipeline(
+                'token-classification', 'fake/ner-model')
+        self.assertEqual(_FakePipeline.last_args, ('token-classification',))
+
+    def test_model_id_forwarded_as_model_kwarg(self):
+        with _patch_pipeline():
+            load_hf_pipeline(
+                'automatic-speech-recognition', 'openai/whisper-tiny')
+        self.assertEqual(
+            _FakePipeline.last_kwargs.get('model'), 'openai/whisper-tiny')
+
+    def test_no_revision_kwarg_when_not_specified(self):
+        with _patch_pipeline():
+            load_hf_pipeline(
+                'automatic-speech-recognition', 'openai/whisper-tiny')
+        self.assertNotIn('revision', _FakePipeline.last_kwargs)
+
+
+class TestLoadHFPipelineDevice(unittest.TestCase):
+    """Device handling: auto-detect, explicit string, explicit int."""
+
+    def test_auto_detect_when_none(self):
+        with _patch_pipeline():
+            _, device = load_hf_pipeline(
+                'automatic-speech-recognition', 'openai/whisper-tiny')
+        self.assertIn(device, ('cpu', 'cuda'))
+        # Same value should have been passed to pipeline().
+        self.assertEqual(_FakePipeline.last_kwargs.get('device'), device)
+
+    def test_explicit_string_device_honored(self):
+        with _patch_pipeline():
+            _, device = load_hf_pipeline(
+                'automatic-speech-recognition', 'openai/whisper-tiny',
+                device='cpu')
+        self.assertEqual(device, 'cpu')
+        self.assertEqual(_FakePipeline.last_kwargs.get('device'), 'cpu')
+
+    def test_explicit_int_device_honored(self):
+        """``pipeline()`` natively accepts ``-1`` for CPU, ``0+`` for
+        a specific GPU index. The helper passes it through unchanged."""
+        with _patch_pipeline():
+            _, device = load_hf_pipeline(
+                'automatic-speech-recognition', 'openai/whisper-tiny',
+                device=-1)
+        self.assertEqual(device, -1)
+        self.assertEqual(_FakePipeline.last_kwargs.get('device'), -1)
+
+
+class TestLoadHFPipelineKwargsPassThrough(unittest.TestCase):
+    """``model_kwargs`` lands inside ``pipeline(model_kwargs={...})``;
+    ``pipeline_kwargs`` is spread directly into the pipeline call."""
+
+    def test_pipeline_kwargs_spread_into_call(self):
+        with _patch_pipeline():
+            load_hf_pipeline(
+                'automatic-speech-recognition', 'openai/whisper-tiny',
+                pipeline_kwargs={
+                    'generate_kwargs': {'num_beams': 5},
+                    'batch_size': 8,
+                })
+        kw = _FakePipeline.last_kwargs
+        self.assertEqual(kw.get('generate_kwargs'), {'num_beams': 5})
+        self.assertEqual(kw.get('batch_size'), 8)
+
+    def test_model_kwargs_nested_under_model_kwargs(self):
+        with _patch_pipeline():
+            load_hf_pipeline(
+                'automatic-speech-recognition', 'openai/whisper-tiny',
+                model_kwargs={'use_safetensors': True})
+        kw = _FakePipeline.last_kwargs
+        self.assertEqual(kw.get('model_kwargs'),
+                         {'use_safetensors': True})
+
+    def test_revision_forwarded(self):
+        with _patch_pipeline():
+            load_hf_pipeline(
+                'automatic-speech-recognition', 'openai/whisper-tiny',
+                revision='abc1234')
+        self.assertEqual(_FakePipeline.last_kwargs.get('revision'), 'abc1234')
+
+    def test_explicit_helper_args_take_precedence(self):
+        """If the caller smuggles ``model`` / ``device`` / ``revision``
+        through ``pipeline_kwargs``, the helper's own args win."""
+        with _patch_pipeline():
+            load_hf_pipeline(
+                'automatic-speech-recognition', 'openai/whisper-tiny',
+                device='cpu', revision='abc1234',
+                pipeline_kwargs={
+                    'model': 'should-be-overridden',
+                    'device': 'should-be-overridden',
+                    'revision': 'should-be-overridden',
+                })
+        kw = _FakePipeline.last_kwargs
+        self.assertEqual(kw['model'], 'openai/whisper-tiny')
+        self.assertEqual(kw['device'], 'cpu')
+        self.assertEqual(kw['revision'], 'abc1234')
+
+
 if __name__ == '__main__':
     unittest.main()

From c2fed094dd15526577f45fae3568634405729347 Mon Sep 17 00:00:00 2001
From: Keigh Rim <keigh.rim@gmail.com>
Date: Fri, 29 May 2026 16:33:49 -0400
Subject: [PATCH 10/10] tightening documentation for new modules

---
 clams/backends/hf.py                          |  23 ++-
 .../templates/app/metadata.py.template        |   9 +
 .../templates/utl-tf/timeframe.py.template    |  10 +
 documentation/app-baseclasses.rst             | 178 ++++++++----------
 documentation/modules.rst                     |   1 +
 tests/test_backends_hf.py                     |  52 +++++
 6 files changed, 167 insertions(+), 106 deletions(-)

diff --git a/clams/backends/hf.py b/clams/backends/hf.py
index b5f9aef..b2dcfab 100644
--- a/clams/backends/hf.py
+++ b/clams/backends/hf.py
@@ -35,6 +35,7 @@ def load_hf_model(
         revision: Optional[str] = None,
         model_kwargs: Optional[dict] = None,
         processor_kwargs: Optional[dict] = None,
+        move_to_device: bool = True,
 ) -> Tuple[Any, Any, str]:
     """
     Load a HuggingFace ``transformers`` model via ``from_pretrained``
@@ -87,12 +88,25 @@ class with ``from_pretrained()``. Defaults to
     :param processor_kwargs: extra kwargs forwarded to
         ``processor_cls.from_pretrained()`` (e.g.,
         ``{'use_safetensors': True, 'use_fast': True}``).
+    :param move_to_device: when ``True`` (default), the helper moves
+        the loaded model to the resolved device and switches it to
+        ``eval()`` mode -- the right behavior for a "ready for
+        inference" app loader. When ``False``, both steps are skipped;
+        the model is returned as ``from_pretrained`` left it (usually
+        on CPU; transformers models are already in eval mode). Use
+        ``False`` for library-style HF wrappers that defer device
+        placement and inference-mode switching to a downstream
+        consumer (e.g. an extractor class that may be combined with
+        a head and only then placed on a device by the wrapping
+        classifier). The returned ``device`` is still the resolved
+        target, so the consumer can use it later for its own
+        ``.to(device)`` call.
 
     :returns: ``(processor, model, device)`` tuple. ``processor`` is
         the loaded processor/tokenizer/feature-extractor (or ``None``
         if ``processor_cls`` was explicitly set to ``None``).
-        ``device`` is the resolved device string the model was moved
-        to.
+        ``device`` is the resolved device string (the model was moved
+        there iff ``move_to_device=True``).
     :rtype: Tuple[Any, Any, str]
     :raises ImportError: if ``torch`` or ``transformers`` is not
         installed. Install the ``[hf]`` extra to fix.
@@ -141,8 +155,9 @@ class with ``from_pretrained()``. Defaults to
     if revision is not None:
         model_load_kwargs.setdefault('revision', revision)
     model = model_cls.from_pretrained(model_id, **model_load_kwargs)
-    model = model.to(resolved_device)
-    model.eval()
+    if move_to_device:
+        model = model.to(resolved_device)
+        model.eval()
 
     return processor, model, resolved_device
 
diff --git a/clams/develop/templates/app/metadata.py.template b/clams/develop/templates/app/metadata.py.template
index d34303a..2de03a5 100644
--- a/clams/develop/templates/app/metadata.py.template
+++ b/clams/develop/templates/app/metadata.py.template
@@ -83,6 +83,15 @@ def appmetadata() -> AppMetadata:
     #         p.default = ['Describe what is in this image.']
     #     elif p.name == 'maxNewTokens':
     #         p.default = 2048
+    #
+    # HF-only: the ``model`` parameter the HF helper injects gets its ``default``
+    # auto-set to the only key when ``analyzer_versions`` has a single entry
+    # (singleton family); for multi-member families the default is ``None`` and
+    # the caller MUST pass ``model=...`` on every request. To provide a
+    # recommended pick instead, mutate ``default`` the same way:
+    # for p in metadata.parameters:
+    #     if p.name == 'model':
+    #         p.default = '<org>/<one-of-the-keys-from-analyzer_versions>'
 
     # CHANGE this line and make sure return the compiled `metadata` instance
     return None
diff --git a/clams/develop/templates/utl-tf/timeframe.py.template b/clams/develop/templates/utl-tf/timeframe.py.template
index 8903144..d2c8d7b 100644
--- a/clams/develop/templates/utl-tf/timeframe.py.template
+++ b/clams/develop/templates/utl-tf/timeframe.py.template
@@ -23,6 +23,16 @@ The helpers are backend-agnostic: tasks can feed a HuggingFace VLM, a
 remote LLM API, a classical CV pipeline, or any other per-frame
 processor. They have no dependency on ``clams.app.ClamsPromptableApp``
 or any other promptable / inference machinery.
+
+These functions are scaffolded into each app so individual apps can
+edit them freely while the pattern stabilizes across the ecosystem.
+Once the shape converges across several apps, the helpers are good
+candidates for promotion into a shared package -- either
+``mmif.utils`` (for the pure-MMIF iteration / TP minting pieces, which
+have no clams-app dependency) or ``clams.<helpers>`` (for the
+task-tuple composition that does presuppose the "writing into a new
+view" CLAMS-app idiom). If/when that happens, apps would import the
+shared version and delete this local copy.
 """
 from typing import Any, Iterator, List, Optional, Tuple, Union
 
diff --git a/documentation/app-baseclasses.rst b/documentation/app-baseclasses.rst
index b440de0..6ac5cfc 100644
--- a/documentation/app-baseclasses.rst
+++ b/documentation/app-baseclasses.rst
@@ -241,42 +241,28 @@ effect, though that's rarely useful.
 Declaring a promptable app
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-A promptable app requires two paired edits relative to the scaffold generated
-by ``clams develop``:
-
-1. In ``app.py``, change the app class's base from :class:`~clams.app.ClamsApp`
-   to one of the promptable base classes. For a non-HF backend (remote API,
-   custom local server, etc.), use :class:`~clams.app.ClamsPromptableApp` and
-   implement :meth:`~clams.app.ClamsPromptableApp.generate`. For a local
-   HuggingFace ``transformers`` model, use
-   :class:`~clams.app.ClamsHFPromptableApp` instead and declare the model via
-   class attributes (no ``generate()`` override needed); see
-   :ref:`hf-promptable` for details. The scaffold file already contains a
-   guiding comment at the class declaration line.
-2. In ``metadata.py``, call ``inject_promptable_parameters`` at the
-   end of ``appmetadata()``. For a plain
-   :class:`~clams.app.ClamsPromptableApp` subclass, call
+A promptable app requires two paired edits relative to the scaffold
+generated by ``clams develop``:
+
+1. In ``app.py``, change the app class's base from
+   :class:`~clams.app.ClamsApp` to
+   :class:`~clams.app.ClamsPromptableApp` and implement
+   :meth:`~clams.app.ClamsPromptableApp.generate`. The scaffold file
+   already contains a guiding comment at the class declaration line.
+2. In ``metadata.py``, call
    :meth:`ClamsPromptableApp.inject_promptable_parameters
-   <clams.app.ClamsPromptableApp.inject_promptable_parameters>`.
-   For a :class:`~clams.app.ClamsHFPromptableApp` subclass, set
-   ``analyzer_versions={<hf-id>: <commit-hash>, ...}`` on the
-   ``AppMetadata`` constructor call and call
-   :meth:`ClamsHFPromptableApp.inject_promptable_parameters
-   <clams.app.ClamsHFPromptableApp.inject_promptable_parameters>`
-   (the HF override that adds the ``model`` parameter on top of
-   the plain set). The scaffold ``metadata.py`` contains
-   commented-out blocks for both variants; uncomment the one
-   matching the base class chosen in step 1.
-
-The ``__main__`` block in ``metadata.py`` is unchanged from non-promptable
-apps. The helper call inside ``appmetadata()`` makes the promptable
-parameters visible to both ``python metadata.py`` (build-time discovery)
-and to :meth:`~clams.app.ClamsApp._load_appmetadata` (runtime). The base
-class change ensures the app inherits the parameter-presence validation,
-the ``generate()`` contract, and the helper methods at runtime.
-
-For a minimal worked HF example, see the class docstring on
-:class:`~clams.app.ClamsHFPromptableApp`.
+   <clams.app.ClamsPromptableApp.inject_promptable_parameters>` at
+   the end of ``appmetadata()``. The scaffold file already contains
+   a commented-out helper-call block; uncomment it.
+
+The ``__main__`` block in ``metadata.py`` is unchanged from
+non-promptable apps. The helper call inside ``appmetadata()`` makes
+the promptable parameters visible to both ``python metadata.py``
+(build-time discovery) and to
+:meth:`~clams.app.ClamsApp._load_appmetadata` (runtime). The base
+class change ensures the app inherits the parameter-presence
+validation, the ``generate()`` contract, and the helper methods at
+runtime.
 
 The ``generate()`` contract
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -359,63 +345,6 @@ Helpers
     properties. See https://clams.ai/clams-vocabulary/Document for
     vocabulary semantics.
 
-Backend helpers
-^^^^^^^^^^^^^^^
-
-The SDK provides optional helper utilities for loading common
-inference backends, so apps don't have to write model-loading
-boilerplate themselves. Backends are kept as separate subpackages
-under ``clams.backends`` and their heavy dependencies are NOT pulled
-in by the base ``clams-python`` install; you opt in via a pip extra
-when your app needs the backend.
-
-.. _backends-hf:
-
-HuggingFace transformers (``clams.backends.hf``)
-""""""""""""""""""""""""""""""""""""""""""""""""
-
-Two loaders cover the two HF-loading conventions a CLAMS app would
-pick between:
-
-:func:`clams.backends.hf.load_hf_model`
-    The ``from_pretrained()`` flow. Returns
-    ``(processor, model, device)`` ready for inference, encapsulating
-    the device, processor/tokenizer, dtype, and inference-mode
-    boilerplate. Use when the app needs raw access to the underlying
-    model and processor (e.g., for custom chat-template formatting or
-    batched ``generate`` calls).
-
-:func:`clams.backends.hf.load_hf_pipeline`
-    The :func:`transformers.pipeline` flow. Returns
-    ``(pipeline, device)`` ready for inference. Use when a task-level
-    pipeline is sufficient (ASR, NER, text classification, zero-shot,
-    etc.). Accepts the same revision pinning and shares the
-    auto-device-detection behavior with :func:`load_hf_model`.
-
-See each function's docstring for the full parameter reference,
-defaults, and pass-through kwargs.
-
-For promptable apps specifically,
-:class:`~clams.app.ClamsHFPromptableApp` (see :ref:`hf-promptable`)
-wraps :func:`load_hf_model` plus the standard inference loop, so most
-HF-backed VLM/LLM apps do not call either loader directly.
-
-Installation
-~~~~~~~~~~~~
-
-``torch`` and ``transformers`` are NOT included in the base
-``clams-python`` install (to keep the SDK lightweight for apps that
-don't need them). When your app uses the HF backend, install with the
-``hf`` extra::
-
-    pip install clams-python[hf]
-
-The helper module imports ``torch`` and ``transformers`` lazily, so a
-plain ``clams-python`` install can still import :mod:`clams.app` and
-:class:`~clams.app.ClamsPromptableApp` without those dependencies; the
-``ImportError`` only fires when an app actually calls
-:func:`clams.backends.hf.load_hf_model`.
-
 .. _hf-promptable:
 
 HuggingFace Promptable Apps
@@ -448,6 +377,39 @@ non-HF local backend, inherit from
 :class:`~clams.app.ClamsPromptableApp` directly and implement
 :meth:`~clams.app.ClamsPromptableApp.generate` yourself.
 
+.. _hf-promptable-declaring:
+
+Declaring an HF promptable app
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+On top of the baseline declaration shared by every promptable app
+(see :ref:`promptable-declaration`), a
+:class:`~clams.app.ClamsHFPromptableApp` subclass:
+
+1. Uses :class:`~clams.app.ClamsHFPromptableApp` (not
+   :class:`~clams.app.ClamsPromptableApp`) as the base class in
+   ``app.py``.
+2. Declares the required class attribute ``MODEL_CLS`` and any
+   optional dtype / padding / kwargs hints (see
+   :ref:`hf-promptable-class-attrs` for the full list).
+3. Sets ``analyzer_versions={<hf-id>: <commit-hash>, ...}`` on the
+   ``AppMetadata`` constructor call in ``metadata.py`` (replaces the
+   singular ``analyzer_version`` for HF apps).
+4. Calls
+   :meth:`ClamsHFPromptableApp.inject_promptable_parameters
+   <clams.app.ClamsHFPromptableApp.inject_promptable_parameters>`
+   (the HF override of the plain helper) at the end of
+   ``appmetadata()``. The scaffold ``metadata.py`` contains a
+   commented-out HF block; uncomment it.
+5. Inherits the base class's
+   :meth:`~clams.app.ClamsHFPromptableApp.generate` implementation;
+   no override needed.
+
+For a minimal worked example, see the class docstring on
+:class:`~clams.app.ClamsHFPromptableApp`.
+
+.. _hf-promptable-class-attrs:
+
 Class-attribute hooks
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -504,7 +466,7 @@ semantics: the model is loaded at app startup, not on first request.
 
 When ``analyzer_versions`` contains multiple entries (a family app),
 loading is deferred until the first :py:meth:`load_model` call inside
-``_annotate``, and ``model`` has no default by default -- callers
+``_annotate``, and ``model`` is injected without a default; callers
 must pick a family member explicitly (or the dev mutates
 ``model.default`` post-injection to provide a recommended pick).
 Loaded models are cached per ``(model_id, revision)`` for the
@@ -535,13 +497,25 @@ What the base class provides
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 A subclass typically only writes ``_annotate()``. The base class
-supplies model loading and caching
-(:py:meth:`~clams.app.ClamsHFPromptableApp.load_model`), the parameter
-injector (:py:meth:`ClamsHFPromptableApp.inject_promptable_parameters
-<clams.app.ClamsHFPromptableApp.inject_promptable_parameters>`),
-a concrete batched HF :py:meth:`~clams.app.ClamsHFPromptableApp.generate`,
-and a default
-:py:meth:`~clams.app.ClamsHFPromptableApp.build_gen_kwargs` mapping
-the SDK promptable parameters to HF ``model.generate()`` kwargs. See
-each method's docstring for full details.
+supplies:
+
+* model loading and caching via
+  :py:meth:`~clams.app.ClamsHFPromptableApp.load_model`, which wraps
+  :func:`clams.backends.hf.load_hf_model` (non-promptable HF apps
+  can call that loader directly without going through this base
+  class);
+* the parameter injector
+  :py:meth:`ClamsHFPromptableApp.inject_promptable_parameters
+  <clams.app.ClamsHFPromptableApp.inject_promptable_parameters>`;
+* a concrete batched HF
+  :py:meth:`~clams.app.ClamsHFPromptableApp.generate`;
+* a default
+  :py:meth:`~clams.app.ClamsHFPromptableApp.build_gen_kwargs` that
+  maps the SDK promptable parameters to HF ``model.generate()``
+  kwargs.
+
+See each method's docstring for full details.
+
+Apps using the HF backend (with or without the promptable wrapper)
+must install the ``[hf]`` extra: ``pip install clams-python[hf]``.
 
diff --git a/documentation/modules.rst b/documentation/modules.rst
index 7897b4a..d25e3cd 100644
--- a/documentation/modules.rst
+++ b/documentation/modules.rst
@@ -7,5 +7,6 @@ API documentation
 
    autodoc/clams.app
    autodoc/clams.appmetadata
+   autodoc/clams.backends
    autodoc/clams.restify
    autodoc/clams.mmif_utils
diff --git a/tests/test_backends_hf.py b/tests/test_backends_hf.py
index df30df5..fae696e 100644
--- a/tests/test_backends_hf.py
+++ b/tests/test_backends_hf.py
@@ -259,6 +259,58 @@ def test_explicit_device_honored(self):
         self.assertEqual(model.device, 'cpu')
 
 
+class TestMoveToDeviceFlag(unittest.TestCase):
+    """
+    ``move_to_device=False`` skips both the ``.to(device)`` move and
+    the ``.eval()`` switch, for library-style HF wrappers that defer
+    device placement and inference-mode switching to a downstream
+    consumer.
+    """
+
+    def setUp(self):  # reset _MockModel's class-level last_from_pretrained_* attributes before each test
+        _MockModel.last_from_pretrained_args = None
+        _MockModel.last_from_pretrained_kwargs = None
+
+    def test_move_skipped_when_flag_false(self):
+        _, model, _ = load_hf_model(
+            'fake-model-id', _MockModel,
+            processor_cls=_MockProcessor,
+            move_to_device=False,  # flag under test; no explicit ``device`` is passed here
+        )
+        # _MockModel.__init__ leaves device=None; .to() would set it.
+        self.assertIsNone(model.device)
+
+    def test_eval_skipped_when_flag_false(self):
+        _, model, _ = load_hf_model(
+            'fake-model-id', _MockModel,
+            processor_cls=_MockProcessor,
+            move_to_device=False,  # flag under test
+        )
+        self.assertFalse(model.eval_called)  # the mock's ``eval_called`` marker must still be falsy
+
+    def test_resolved_device_still_returned(self):
+        """Even when not moved, the resolved target is reported so the
+        downstream consumer can use it for its own ``.to(device)``."""
+        _, _, device = load_hf_model(  # only the third element of the returned triple is checked
+            'fake-model-id', _MockModel,
+            processor_cls=_MockProcessor,
+            device='cpu',
+            move_to_device=False,
+        )
+        self.assertEqual(device, 'cpu')
+
+    def test_default_still_moves_and_evals(self):
+        """Regression guard: the default (omitted) value of the new
+        flag preserves prior behavior."""
+        _, model, _ = load_hf_model(
+            'fake-model-id', _MockModel,
+            processor_cls=_MockProcessor,
+            device='cpu',  # ``move_to_device`` is deliberately omitted to exercise its default
+        )
+        self.assertEqual(model.device, 'cpu')
+        self.assertTrue(model.eval_called)
+
+
 # ---------------------------------------------------------------------------
 # load_hf_pipeline tests
 # ---------------------------------------------------------------------------