diff --git a/Dockerfile b/Dockerfile
index 18804b6..9dc59a6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,8 @@
 FROM python:3.12-slim
 
 ENV PYTHONDONTWRITEBYTECODE=1 \
-    PYTHONUNBUFFERED=1
+    PYTHONUNBUFFERED=1 \
+    PYTHONPATH=/app
 
 WORKDIR /app
 
@@ -10,7 +11,11 @@ RUN pip install --no-cache-dir -r /app/requirements.txt
 
 COPY agentic_ran /app/agentic_ran
 COPY scripts /app/scripts
+COPY src /app/src
+COPY models /app/models
+COPY policies /app/policies
+COPY configs /app/configs
 
 RUN mkdir -p /app/results /app/shared_data
 
-CMD ["python", "-m", "scripts.run_scenario", "--scenario", "lightweight-32"]
+CMD ["python", "-m", "src.benchmark", "--benchmark-scope", "main"]
diff --git a/README.md b/README.md
index 9a3b5f7..96fa96c 100644
--- a/README.md
+++ b/README.md
@@ -1,142 +1,42 @@
-# Agentic RAN Traffic-Aware Control Framework
+# Agentic-RAN Benchmark
 
-## Project goal
-This repository now targets an **agentic, traffic-aware RAN control workflow**:
-1. Observe slice-level RAN state from `*_metrics.csv`.
-2. Engineer temporal + traffic-class + schedule-aware context.
-3. Predict throughput (`tx_brate downlink [Mbps]`).
-4. Recommend interpretable control actions (PRB change / scheduler switch).
+This repository benchmarks slice-aware RAN forecasting and safe agentic control.
 
-The proposed method family is **Liquid Dynamics** (represented by `liquid-baseline`) and is benchmarked against:
-- lightweight MLP (`lightweight-32`, `lightweight-64`)
-- balanced MLP (`balanced-small`, `balanced-medium`)
-- deep MLP (`deep-performance`)
-- ultra-performance MLP (`ultra-performance`)
-- attention-based sequence modeling (`attention-baseline`)
-- xLSTM (`xlstm-baseline`)
+## Main benchmark scope
+The default benchmark focuses on:
+- Time-aware tabular/residual forecasting models
+- Strong gradient boosting baseline
+- Graph-aware actor-critic and masked PPO control baselines
+- **SafeGraphAgent-RAN** proposed method
 
-## Why this repository is agentic
-The framework does not stop at KPI prediction. It builds context-aware slice state, selects an action from an explicit action space, and records a human-readable reason and confidence for each decision.
+Appendix temporal models remain available (run them with `--benchmark-scope appendix` or `--benchmark-scope all`) but are excluded from the default `main` scope.
 
-## Dataset structure and attribution
-### Documented input folder
-Use `dataset/` as the canonical input location for raw CSV data preparation.
-
-Expected layout:
-- `dataset/slice_mixed/**/*_metrics.csv`
-- `dataset/slice_traffic/**/*_metrics.csv`
-
-`prepare_splits.py` accepts `dataset/` directly and recursively scans only `*_metrics.csv` files.
-
-### Tested reference dataset
-Data preparation is tested with the **Colosseum O-RAN COMMAG Dataset** associated with:
-> L. Bonati, S. D'Oro, M. Polese, S. Basagni, T. Melodia, “Intelligence and Learning in O-RAN for Data-driven NextG Cellular Networks,” IEEE Communications Magazine, vol. 59, no. 10, pp. 21–27, October 2021.
-
-Please cite that paper if you use the dataset in a publication.
-
-## Target column and feature handling
-- The prepared benchmark dataset always uses an explicit target column named **`target`**.
-- During preparation, you can explicitly set the raw target column with:
-  ```bash
-  python -m scripts.prepare_splits --target-col "tx_brate downlink [Mbps]"
-  ```
-- If `--target-col` is not set, preparation defaults to `tx_brate downlink [Mbps]` (or `ratio_granted_req` for URLLC experiments via `--target-col ratio_granted_req`).
-- **Actual source feature names are preserved** (no remapping to `feature_0`, `feature_1`, ...).
-
-## Requirements
-- Python **3.12**
-- PyTorch (CPU-compatible build by default in Docker)
-- Docker + Docker Compose
-
-Dependencies are declared in `requirements.txt`.
-
-## Repository structure
-- `agentic_ran/`
-  - `data_loading.py`: dataset loading and fallback behavior
-  - `preprocessing.py`: feature extraction, scaling, sequence building, splitting
-  - `models.py`: model factory and architectures
-  - `training.py`: training loop
-  - `evaluation.py`: metric computation and composite scoring
-  - `reporting.py`: outputs and plots
-  - `scenarios.py`: scenario catalog and hyperparameters
-- `scripts/prepare_splits.py`: raw-data preparation and train/val/test split generation
-- `scripts/run_scenario.py`: run one scenario
-- `scripts/run_all.py`: end-to-end prepare + run + aggregate
-- `scripts/aggregate_report.py`: final benchmark report generation (`results/report.html`)
-
-## Temporal and traffic-aware modeling assumptions
-- Base stations: 4, slices per BS: 3, 15 PRBs (3 MHz).
-- Slice semantics: slice 0 = eMBB, slice 1 = MTC, slice 2 = URLLC.
-- Scheduling policy IDs: 0=RR, 1=WF, 2=PF.
-- Dynamic slice-resizing is encoded through experiment-second phase features.
-
-## Action space (agentic policy)
-- 0 keep_allocation
-- 1/2/3 increase_{embb|mtc|urllc}_prb
-- 4/5/6 decrease_{embb|mtc|urllc}_prb
-- 7/8/9 switch_to_{rr|wf|pf}
-
-> If action labels are used for training, they are pseudo-labels generated by the deterministic rule-based policy and **not operator ground truth**.
-
-## Experiments workflow
-### 1) Prepare data
+## Commands
+### Docker
 ```bash
 docker compose up --build prepare-data
-```
-Equivalent local command:
-```bash
-python -m scripts.prepare_splits \
-  --input-dir dataset/slice_mixed \
-  --input-dir dataset/slice_traffic \
-  --output-dir shared_data/splits
+docker compose up --build benchmark-main
+docker compose up --build benchmark-appendix
+docker compose up --build benchmark-all
+docker compose up --build report
 ```
 
-### 2) Run a single scenario
+### Local CLI
 ```bash
-docker compose up lightweight-32
+python -m src.benchmark --benchmark-scope main
+python -m src.benchmark --benchmark-scope appendix
+python -m src.benchmark --benchmark-scope all
+python -m src.benchmark --benchmark-scope foundation
+python -m src.report
 ```
-or
-```bash
-python -m scripts.run_scenario --scenario lightweight-32
-```
-
-### 3) Run complete benchmark
-```bash
-docker compose up --build run-all
-```
-
-### 4) Generate agentic decisions for one scenario
-```bash
-python -m scripts.run_scenario --scenario agentic_residual_mlp --use-action-head
-```
-Outputs:
-- `results/<scenario>/agentic_decisions.csv`
-- `results/<scenario>/agentic_summary.json`
 
-## Reproducibility guidance
-- Splits are time-aware and chronological per file (60/10/30 train/val/test).
-- Reuse the same files in `shared_data/splits/` across scenario runs.
-- Pin epochs with `EPOCHS=<N>` when comparing architectures.
-- Preserve run artifacts under `results/<scenario>/` (metrics, metadata, predictions, training logs, plots).
-- Track `shared_data/splits/summary.json` to capture file provenance, source target columns, and selected feature names.
+## Outputs
+- `results/main_benchmark.csv` (written for scopes `main` and `all`)
+- `results/appendix_benchmark.csv` (written for scopes `appendix` and `all`)
+- `results/model_ranking.csv`
+- `results/control_ranking.csv`
+- `results/safegraphagent_ran_metrics.csv`
+- `results/report.html`
 
-## Final report interpretation
-The report in `results/report.html` includes cumulative and per-metric views.
-
-- **Higher is better**: `R2`, `composite_score`
-- **Lower is better**: `RMSE`, `MAE`, `MAPE`, `sMAPE`, `wMAPE`
-
-The current benchmark report ranks **`liquid-baseline`** as best under cumulative composite score.
-However, you should also inspect individual metrics separately because other baselines may win on specific metrics (e.g., stronger `R2` or lower `RMSE`).
-
-## License
-This project is licensed under the **MIT License**. See `LICENSE`.
-
-
-## New benchmark scopes
-```bash
-docker-compose up --build benchmark-main
-docker-compose up --build benchmark-appendix
-docker-compose up --build benchmark-all
-docker-compose up --build report
-```
+## Scientific note
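+Pseudo-label action metrics are not enough to prove real control quality. Use offline reward proxies, safety fallback behavior, and constraint adherence for control assessment. Note that `src.benchmark` currently emits fixed placeholder values (see `mock_metrics`), so the generated CSVs are not measured results.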
diff --git a/configs/benchmark_models.yaml b/configs/benchmark_models.yaml
index fe4d124..18834b1 100644
--- a/configs/benchmark_models.yaml
+++ b/configs/benchmark_models.yaml
@@ -1,5 +1,31 @@
-{
-  "main_models": ["mlp_lightweight_32","mlp_balanced_small","mlp_with_time_features","residual_mlp_128","kan_baseline","gradient_boosting_baseline","agentic_residual_mlp","graph_actor_critic_ran","masked_graph_ppo_ran","safegraphagent_ran"],
-  "appendix_models": ["attention_baseline","liquid_baseline","xlstm_baseline","residual_tcn_16","residual_tcn_32","residual_liquid_tcn_16","residual_liquid_tcn_32","patchtst_baseline","tsmixer_baseline","agentic_liquid_residual","agentic_sequence_attention","agentic_patch_kan_mixer"],
-  "optional_foundation_models": ["chronos_bolt","timesfm","tiny_time_mixer","moirai"]
-}
\ No newline at end of file
+main_models:
+  - mlp_lightweight_32
+  - mlp_balanced_small
+  - mlp_with_time_features
+  - residual_mlp_128
+  - kan_baseline
+  - gradient_boosting_baseline
+  - agentic_residual_mlp
+  - graph_actor_critic_ran
+  - masked_graph_ppo_ran
+  - safegraphagent_ran
+
+appendix_models:
+  - attention_baseline
+  - liquid_baseline
+  - xlstm_baseline
+  - residual_tcn_16
+  - residual_tcn_32
+  - residual_liquid_tcn_16
+  - residual_liquid_tcn_32
+  - patchtst_baseline
+  - tsmixer_baseline
+  - agentic_liquid_residual
+  - agentic_sequence_attention
+  - agentic_patch_kan_mixer
+
+optional_foundation_models:
+  - chronos_bolt
+  - timesfm
+  - tiny_time_mixer
+  - moirai
diff --git a/docker-compose.yml b/docker-compose.yml
index a7e79e6..e3fbab2 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -4,13 +4,16 @@ x-common: &common
   volumes:
     - ./agentic_ran:/app/agentic_ran
     - ./scripts:/app/scripts
+    - ./src:/app/src
+    - ./models:/app/models
+    - ./policies:/app/policies
+    - ./configs:/app/configs
     - ./shared_data:/app/shared_data
     - ./results:/app/results
   environment:
     EPOCHS: ${EPOCHS:-5}
 
 services:
-
   prepare-data:
     <<: *common
     command: python -m scripts.prepare_splits --input-dir dataset/slice_mixed --input-dir dataset/slice_traffic --output-dir shared_data/splits
@@ -18,94 +21,10 @@ services:
       - ./agentic_ran:/app/agentic_ran
       - ./scripts:/app/scripts
       - ./dataset:/app/dataset
-      - ./shared_data:/app/shared_data
-      - ./results:/app/results
-
-  lightweight-32:
-    <<: *common
-    depends_on:
-      - prepare-data
-    command: python -m scripts.run_scenario --scenario lightweight-32
-
-  lightweight-64:
-    <<: *common
-    depends_on:
-      - prepare-data
-    command: python -m scripts.run_scenario --scenario lightweight-64
-
-  balanced-small:
-    <<: *common
-    depends_on:
-      - prepare-data
-    command: python -m scripts.run_scenario --scenario balanced-small
-
-  balanced-medium:
-    <<: *common
-    depends_on:
-      - prepare-data
-    command: python -m scripts.run_scenario --scenario balanced-medium
-
-  deep-performance:
-    <<: *common
-    depends_on:
-      - prepare-data
-    command: python -m scripts.run_scenario --scenario deep-performance
-
-  ultra-performance:
-    <<: *common
-    depends_on:
-      - prepare-data
-    command: python -m scripts.run_scenario --scenario ultra-performance
-
-  attention-baseline:
-    <<: *common
-    depends_on:
-      - prepare-data
-    command: python -m scripts.run_scenario --scenario attention-baseline
-
-  liquid-baseline:
-    <<: *common
-    depends_on:
-      - prepare-data
-    command: python -m scripts.run_scenario --scenario liquid-baseline
-
-  xlstm-baseline:
-    <<: *common
-    depends_on:
-      - prepare-data
-    command: python -m scripts.run_scenario --scenario xlstm-baseline
-
-  aggregator:
-    <<: *common
-    command: python -m scripts.aggregate_report
-    depends_on:
-      - lightweight-32
-      - lightweight-64
-      - balanced-small
-      - balanced-medium
-      - deep-performance
-      - ultra-performance
-      - attention-baseline
-      - liquid-baseline
-      - xlstm-baseline
-
-  run-all:
-    <<: *common
-    command: python -m scripts.run_all
-    volumes:
-      - ./agentic_ran:/app/agentic_ran
-      - ./scripts:/app/scripts
-      - ./dataset:/app/dataset
-      - ./shared_data:/app/shared_data
-      - ./results:/app/results
-
-  full-run:
-    <<: *common
-    command: python -m scripts.run_all
-    volumes:
-      - ./agentic_ran:/app/agentic_ran
-      - ./scripts:/app/scripts
-      - ./dataset:/app/dataset
+      - ./src:/app/src
+      - ./models:/app/models
+      - ./policies:/app/policies
+      - ./configs:/app/configs
       - ./shared_data:/app/shared_data
       - ./results:/app/results
 
@@ -113,14 +32,14 @@ services:
     <<: *common
     command: python -m src.benchmark --benchmark-scope main
 
-  benchmark-all:
-    <<: *common
-    command: python -m src.benchmark --benchmark-scope all
-
   benchmark-appendix:
     <<: *common
     command: python -m src.benchmark --benchmark-scope appendix
 
+  benchmark-all:
+    <<: *common
+    command: python -m src.benchmark --benchmark-scope all
+
   report:
     <<: *common
     command: python -m src.report
diff --git a/requirements.txt b/requirements.txt
index 3e4682c..6a726da 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ numpy>=1.26
 pandas>=2.2
 matplotlib>=3.8
 torch>=2.3
+
diff --git a/src/benchmark.py b/src/benchmark.py
index 7f66aa8..6637d6a 100644
--- a/src/benchmark.py
+++ b/src/benchmark.py
@@ -1,37 +1,84 @@
 from __future__ import annotations
-import argparse, json
+
+import argparse
 from pathlib import Path
+
 import pandas as pd
-from src.ranking import rank_forecasting, rank_control
+
+from src.ranking import rank_control, rank_forecasting
+
+
+def _load_yaml_like(path: Path) -> dict:
+    """Parse ``key:`` headers and ``- item`` lines into lists, ignoring comments and quotes."""
+    data: dict[str, list[str]] = {}
+    current = None
+    for raw in path.read_text(encoding="utf-8").splitlines():
+        line = raw.split(" #", 1)[0].strip()  # drop trailing "# ..." comments
+        if not line or line.startswith("#"):
+            continue
+        if line.endswith(":"):
+            current = line[:-1].strip().strip("'\"")
+            data[current] = []
+        elif line.startswith("-") and current:
+            data[current].append(line[1:].strip().strip("'\""))
+    return data
 
 
 def load_models(scope: str):
-    cfg = json.loads(Path('configs/benchmark_models.yaml').read_text())
-    if scope=='main': return cfg['main_models']
-    if scope=='appendix': return cfg['appendix_models']
-    if scope=='foundation': return cfg['optional_foundation_models']
-    if scope=='all': return cfg['main_models']+cfg['appendix_models']
+    """Return the model names for ``scope``; unknown scopes raise ValueError."""
+    catalog = _load_yaml_like(Path("configs/benchmark_models.yaml"))
+    # Scopes that map onto exactly one section of the config file.
+    single = {"main": "main_models", "appendix": "appendix_models", "foundation": "optional_foundation_models"}
+    if scope in single:
+        return catalog[single[scope]]
+    # "all" is main + appendix; the optional foundation models are not part of it.
+    if scope == "all":
+        return catalog["main_models"] + catalog["appendix_models"]
     raise ValueError(scope)
 
+
 def mock_metrics(model):
-    return {"model":model,"r2":0.7,"rmse":0.3,"mae":0.2,"smape":10.0,"wmape":9.0,"composite_score":60.0,"peak_mae":0.3,"peak_rmse":0.4,"peak_r2":0.6,
-            "average_reward":0.1 if 'graph' in model or 'agentic' in model or 'safe' in model else None,
-            "cumulative_reward":10.0 if 'graph' in model or 'agentic' in model or 'safe' in model else None,
-            "action_switch_rate":0.2 if 'graph' in model or 'agentic' in model or 'safe' in model else None,
-            "safe_fallback_rate":0.05 if 'graph' in model or 'agentic' in model or 'safe' in model else None,
-            "avg_decision_confidence":0.75 if 'graph' in model or 'agentic' in model or 'safe' in model else None,
-            "pseudo_label_action_accuracy":0.5 if 'graph' in model or 'agentic' in model or 'safe' in model else None}
+    """Build the placeholder metric row for ``model`` (fixed constants, not measurements)."""
+    # Control metrics are only filled in for names containing one of these markers.
+    is_control = "graph" in model or "agentic" in model or "safe" in model
+    row = {
+        "model": model,
+        "r2": 0.7, "rmse": 0.3, "mae": 0.2,
+        "smape": 10.0, "wmape": 9.0, "composite_score": 60.0,
+        "peak_mae": 0.3, "peak_rmse": 0.4, "peak_r2": 0.6,
+    }
+    control = {
+        "average_reward": 0.1,
+        "cumulative_reward": 10.0,
+        "action_switch_rate": 0.2,
+        "safe_fallback_rate": 0.05,
+        "avg_decision_confidence": 0.75,
+        "pseudo_label_action_accuracy": 0.5,
+    }
+    row.update({key: (value if is_control else None) for key, value in control.items()})
+    return row
+
 
 def main():
-    ap=argparse.ArgumentParser(); ap.add_argument('--benchmark-scope',default='main',choices=['main','appendix','all','foundation']); args=ap.parse_args()
-    models=load_models(args.benchmark_scope)
-    rows=[mock_metrics(m) for m in models]
-    df=pd.DataFrame(rows)
-    out=Path('results'); out.mkdir(exist_ok=True)
-    if args.benchmark_scope in ('main','all'): df[df['model'].isin(load_models('main'))].to_csv(out/'main_benchmark.csv',index=False)
-    if args.benchmark_scope in ('appendix','all'): df[df['model'].isin(load_models('appendix'))].to_csv(out/'appendix_benchmark.csv',index=False)
-    rank_forecasting(df).to_csv(out/'model_ranking.csv',index=False)
-    rank_control(df.dropna(subset=['average_reward'], how='all')).to_csv(out/'control_ranking.csv',index=False)
-    df[df['model']=='safegraphagent_ran'].to_csv(out/'safegraphagent_ran_metrics.csv',index=False)
-
-if __name__=='__main__': main()
+    """Run the selected benchmark scope and write its CSV outputs under ``results/``."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--benchmark-scope", default="main", choices=["main", "appendix", "all", "foundation"])
+    args = ap.parse_args()
+    df = pd.DataFrame([mock_metrics(m) for m in load_models(args.benchmark_scope)])
+    out = Path("results")
+    out.mkdir(exist_ok=True)
+
+    if args.benchmark_scope in ("main", "all"):
+        df[df["model"].isin(load_models("main"))].to_csv(out / "main_benchmark.csv", index=False)
+    if args.benchmark_scope in ("appendix", "all"):
+        df[df["model"].isin(load_models("appendix"))].to_csv(out / "appendix_benchmark.csv", index=False)
+    rank_forecasting(df).to_csv(out / "model_ranking.csv", index=False)
+    rank_control(df.dropna(subset=["average_reward"], how="all")).to_csv(out / "control_ranking.csv", index=False)
+    proposed = df[df["model"] == "safegraphagent_ran"]
+    target = out / "safegraphagent_ran_metrics.csv"
+    if not proposed.empty or not target.exists():  # never clobber real metrics with a header-only file
+        proposed.to_csv(target, index=False)
+
+
+if __name__ == "__main__":
+    main()