From a6edb339eeb44c2b89cd482afe0fbf04b6fed8e5 Mon Sep 17 00:00:00 2001 From: theproteinbot Date: Fri, 27 Mar 2026 23:16:17 -0400 Subject: [PATCH] Add CatPred integration scaffold --- Dockerfile | 11 +++++ api/embeddings/registry.py | 6 ++- api/methods/catpred.py | 50 ++++++++++++++++++++ docker-compose.prod.yml | 1 + docker-requirements/catpred_requirements.txt | 17 +++++++ models/CatPred/README.md | 20 ++++++++ webKinPred/config_base.py | 2 + webKinPred/config_docker.py | 8 ++++ webKinPred/config_local.py | 11 +++++ 9 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 api/methods/catpred.py create mode 100644 docker-requirements/catpred_requirements.txt create mode 100644 models/CatPred/README.md diff --git a/Dockerfile b/Dockerfile index a2ae0315..c22f4d5b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -106,6 +106,15 @@ RUN --mount=type=cache,target=/opt/conda/pkgs,sharing=locked \ && mamba install -n catapro_env -c conda-forge rdkit=2024.03.6 -y \ && conda run -n catapro_env pip install -r docker-requirements/catapro_requirements.txt +# ── CatPred ─────────────────────────────────────────────────────────────────── +FROM base AS env-catpred +COPY docker-requirements/catpred_requirements.txt ./docker-requirements/ +RUN --mount=type=cache,target=/opt/conda/pkgs,sharing=locked \ + --mount=type=cache,id=webkinpred-pip-py310,target=/root/.cache/pip,sharing=locked \ + mamba create -n catpred_env python=3.10.15 -c conda-forge -y \ + && mamba install -n catpred_env -c conda-forge rdkit=2024.03.6 -y \ + && conda run -n catpred_env pip install -r docker-requirements/catpred_requirements.txt + # ── pseq2sites ──────────────────────────────────────────────────────────────── FROM base AS env-pseq2sites RUN --mount=type=cache,target=/opt/conda/pkgs,sharing=locked \ @@ -155,6 +164,7 @@ COPY --from=env-eitlem /opt/conda/envs/eitlem_env /opt/conda/envs/eitlem_en COPY --from=env-turnup /opt/conda/envs/turnup_env /opt/conda/envs/turnup_env COPY --from=env-unikp /opt/conda/envs/unikp /opt/conda/envs/unikp COPY --from=env-catapro /opt/conda/envs/catapro_env /opt/conda/envs/catapro_env +COPY --from=env-catpred /opt/conda/envs/catpred_env /opt/conda/envs/catpred_env COPY --from=env-pseq2sites /opt/conda/envs/pseq2sites /opt/conda/envs/pseq2sites COPY --from=env-esm /opt/conda/envs/esm /opt/conda/envs/esm COPY --from=env-esmc /opt/conda/envs/esmc /opt/conda/envs/esmc @@ -174,6 +184,7 @@ RUN find /opt/conda -name "*.pyc" -delete \ COPY . . RUN mkdir -p /app/models/EITLEM/Weights \ + /app/models/CatPred \ /app/models/TurNup/data/saved_models \ /app/models/UniKP-main/models \ /app/media/sequence_info \ diff --git a/api/embeddings/registry.py b/api/embeddings/registry.py index 1a9571a2..0464f438 100644 --- a/api/embeddings/registry.py +++ b/api/embeddings/registry.py @@ -34,11 +34,13 @@ "implemented": True, "conda_env": "esm", "python_path_key": "esm2", # key in config PYTHON_PATHS - "used_by": ["KinForm-H", "KinForm-L"], + "used_by": ["KinForm-H", "KinForm-L", "CatPred"], "notes": ( "Invoked as a subprocess by KinForm. The python path is passed via " "the KINFORM_ESM_PATH environment variable. Multi-layer embeddings " - "are extracted in a single model-load pass." + "are extracted in a single model-load pass. CatPred uses per-residue " + "ESM2 features, so it bridges into a method-specific cache format " + "rather than reusing the shared mean-vector cache directly." ), }, diff --git a/api/methods/catpred.py b/api/methods/catpred.py new file mode 100644 index 00000000..90bca865 --- /dev/null +++ b/api/methods/catpred.py @@ -0,0 +1,50 @@ +# api/methods/catpred.py +# +# Method descriptor for CatPred. + +from api.methods.base import MethodDescriptor, SubprocessEngineConfig + + +descriptor = MethodDescriptor( + key="CatPred", + display_name="CatPred", + authors=( + "Veda Sheersh Boorla, Somtirtha Santra, Costas D. Maranas" + ), + publication_title=( + "CatPred: A comprehensive framework for deep learning in vitro enzyme kinetic parameters" + ), + citation_url="https://www.nature.com/articles/s41467-025-57215-9", + repo_url="https://github.com/maranasgroup/CatPred", + more_info=( + "CatPred currently integrates kcat and Km through a local CPU inference " + "adapter. Ki remains outside the current webKinPred target model." + ), + + supports=["kcat", "Km"], + input_format="single", + output_cols={ + "kcat": "kcat (1/s)", + "Km": "KM (mM)", + }, + max_seq_len=2048, + col_to_kwarg={"Substrate": "substrates"}, + target_kwargs={ + "kcat": {"kinetics_type": "KCAT"}, + "Km": {"kinetics_type": "KM"}, + }, + subprocess=SubprocessEngineConfig( + python_path_key="CatPred", + script_key="CatPred", + data_path_env={ + "CATPRED_REPO_ROOT": "CatPred", + "CATPRED_MEDIA_PATH": "media", + "CATPRED_TOOLS_PATH": "tools", + "PYTHONPATH": "CatPred", + }, + extra_env={ + "PROTEIN_EMBED_USE_CPU": "1", + }, + ), + embeddings_used=["esm2"], +) diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index c8f4aa85..3820a3ab 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -95,6 +95,7 @@ services: - ./models/DLKcat/DeeplearningApproach:/app/models/DLKcat/DeeplearningApproach:ro - ./models/KinForm/results/trained_models:/app/models/KinForm/results/trained_models:ro - ./models/CataPro:/app/models/CataPro:ro + - ./models/CatPred:/app/models/CatPred:ro environment: <<: *common-env depends_on: diff --git a/docker-requirements/catpred_requirements.txt b/docker-requirements/catpred_requirements.txt new file mode 100644 index 00000000..2183cb65 --- /dev/null +++ b/docker-requirements/catpred_requirements.txt @@ -0,0 +1,17 @@ +# Requirements for CatPred (Python 3.10) +torch==2.4.1 +fair-esm==2.0.0 +transformers==4.47.1 +sentencepiece==0.2.0 +numpy==1.26.4 +pandas==2.2.3 +pandas-flavor==0.6.0 +scikit-learn==1.5.2 +scipy==1.14.1 +tqdm==4.67.1 +typed-argument-parser==1.10.1 +descriptastorus==2.8.0 +rotary-embedding-torch==0.6.5 +hyperopt==0.2.7 +matplotlib==3.9.2 +tensorboardX==2.6.2.2 diff --git a/models/CatPred/README.md b/models/CatPred/README.md new file mode 100644 index 00000000..404dab76 --- /dev/null +++ b/models/CatPred/README.md @@ -0,0 +1,20 @@ +# CatPred + +This directory is reserved for the CatPred integration. + +Expected contents: +- a checkout of the CatPred repository rooted at `models/CatPred/` +- the adapter entrypoint at `models/CatPred/catpred/integration/webkinpred_adapter.py` + +Local development can instead point webKinPred at an external CatPred checkout: + +```bash +export WEBKINPRED_CATPRED_ROOT="/absolute/path/to/CatPred" +export WEBKINPRED_CATPRED_PYTHON="/absolute/path/to/CatPred/.venv/bin/python" +``` + +Docker/runtime notes: +- `webKinPred/config_docker.py` defaults to `/app/models/CatPred` +- the CatPred subprocess descriptor sets `CATPRED_REPO_ROOT`, `CATPRED_MEDIA_PATH`, and `CATPRED_TOOLS_PATH` +- CatPred kcat/Km use per-residue ESM2 features and cache them under + `media/sequence_info/esm2_last/per_residue/{seq_id}.pt` diff --git a/webKinPred/config_base.py b/webKinPred/config_base.py index fb9b4930..e8de7518 100644 --- a/webKinPred/config_base.py +++ b/webKinPred/config_base.py @@ -10,6 +10,7 @@ _DATA_PATH_REL = { + "CatPred": "models/CatPred", "DLKcat": "models/DLKcat/DeeplearningApproach/Data", "DLKcat_Results": "models/DLKcat/DeeplearningApproach/Results", "EITLEM": "models/EITLEM", @@ -23,6 +24,7 @@ _PREDICTION_SCRIPT_REL = { + "CatPred": "models/CatPred/catpred/integration/webkinpred_adapter.py", "DLKcat": "models/DLKcat/DeeplearningApproach/Code/example/prediction_for_input.py", "EITLEM": "models/EITLEM/Code/eitlem_prediction_script_batch.py", "TurNup": "models/TurNup/code/kcat_prediction_batch.py", diff --git a/webKinPred/config_docker.py b/webKinPred/config_docker.py index 7664e682..8d431de4 100644 --- a/webKinPred/config_docker.py +++ b/webKinPred/config_docker.py @@ -1,3 +1,5 @@ +import os + from webKinPred.config_base import ( DEFAULT_ALLOWED_FRONTEND_IPS, SERVER_LIMIT, @@ -13,6 +15,7 @@ FASTAS_DIR = f"{BASE_PATH}/fastas" PYTHON_PATHS = { + "CatPred": "/opt/conda/envs/catpred_env/bin/python", "DLKcat": "/opt/conda/envs/dlkcat_env/bin/python", "EITLEM": "/opt/conda/envs/eitlem_env/bin/python", "TurNup": "/opt/conda/envs/turnup_env/bin/python", @@ -28,6 +31,11 @@ DATA_PATHS = build_data_paths(BASE_PATH) PREDICTION_SCRIPTS = build_prediction_scripts(BASE_PATH) +CATPRED_ROOT = os.environ.get("WEBKINPRED_CATPRED_ROOT") +if CATPRED_ROOT: + DATA_PATHS["CatPred"] = CATPRED_ROOT + PREDICTION_SCRIPTS["CatPred"] = f"{CATPRED_ROOT}/catpred/integration/webkinpred_adapter.py" + SIMILARITY_DATASETS = build_similarity_datasets(FASTAS_DIR) TARGET_DBS = {label: item["target_db"] for label, item in SIMILARITY_DATASETS.items()} diff --git a/webKinPred/config_local.py b/webKinPred/config_local.py index f0d1adaf..9c10c06e 100644 --- a/webKinPred/config_local.py +++ b/webKinPred/config_local.py @@ -26,6 +26,7 @@ def _env_python(env_name: str) -> str: PYTHON_PATHS = { + "CatPred": _env_python("catpred_env"), "DLKcat": _env_python("dlkcat_env"), "EITLEM": _env_python("eitlem_env"), "TurNup": _env_python("turnup_env"), @@ -41,6 +42,16 @@ def _env_python(env_name: str) -> str: DATA_PATHS = build_data_paths(BASE_PATH) PREDICTION_SCRIPTS = build_prediction_scripts(BASE_PATH) +CATPRED_ROOT = os.environ.get("WEBKINPRED_CATPRED_ROOT") +if CATPRED_ROOT: + catpred_root = str(Path(CATPRED_ROOT).resolve()) + DATA_PATHS["CatPred"] = catpred_root + PREDICTION_SCRIPTS["CatPred"] = str(Path(catpred_root) / "catpred" / "integration" / "webkinpred_adapter.py") + PYTHON_PATHS["CatPred"] = os.environ.get( + "WEBKINPRED_CATPRED_PYTHON", + _env_python("catpred_env"), + ) + SIMILARITY_DATASETS = build_similarity_datasets(FASTAS_DIR) TARGET_DBS = {label: item["target_db"] for label, item in SIMILARITY_DATASETS.items()}