diff --git a/.env.example b/.env.example index c0bef7f..4047200 100644 --- a/.env.example +++ b/.env.example @@ -6,6 +6,40 @@ GID=1000 # Anthropic ANTHROPIC_API_KEY=sk-ant-... +# ── Auth (JWT) ──────────────────────────────────────────────────────────────── +# Leave ALL of these blank to run in dev mode (no auth, anonymous superuser). +# +# Option A — HS256 shared secret (simple / internal): +# JWT_SECRET=supersecretkey +# +# Option B — RS256 via JWKS (Cognito, Auth0, Okta — recommended for production): +# JWT_JWKS_URL=https://cognito-idp.eu-west-1.amazonaws.com//.well-known/jwks.json +# JWT_ISSUER=https://cognito-idp.eu-west-1.amazonaws.com/ +# JWT_AUDIENCE= # optional but recommended +# +# Option B2 — Microsoft Entra ID (Azure AD): +# JWT_JWKS_URL=https://login.microsoftonline.com//discovery/v2.0/keys +# JWT_ISSUER=https://login.microsoftonline.com//v2.0 +# JWT_AUDIENCE= +# +JWT_SECRET= +JWT_JWKS_URL= +JWT_ISSUER= +JWT_AUDIENCE= + +# ── Frontend Azure AD SSO (Vite build-time) ─────────────────────────────────── +# Leave blank to run without SSO (dev mode — no login screen). +# Must match the app registration in Azure Entra ID. +VITE_AZURE_CLIENT_ID= +VITE_AZURE_TENANT_ID= + +# ── Permissions (DynamoDB) ──────────────────────────────────────────────────── +# Leave blank to run in dev mode (all users get superuser permissions). +# Table schema: PK=user_id (S), email (S), role_arn (S), +# allowed_datasets (SS), allowed_namespaces (SS), is_admin (BOOL) +# +DYNAMODB_PERMISSIONS_TABLE= + # AWS (puede ser IAM estático o credenciales temporales de STS) AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= diff --git a/.gitignore b/.gitignore index 69d20e8..0f6dd9a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ data/ openlineage/data/ openlineage/events.ndjson chroma_data/ +knowledge/ # Caché de Maven / Ivy (JARs descargados por spark.jars.packages) .ivy2/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..bb9800f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,21 @@ +# Changelog + +## Unreleased + +### 2026-05-06 +- **docs: add contributing guide and MIT license** — Added CONTRIBUTING.md with setup instructions, project layout, development workflows, and commit conventions; added MIT LICENSE.md to establish open-source governance. + +### 2026-05-05 +- **feat(auth): add JWT authentication and role-based permissions** — Added JWT authentication (HS256 and RS256/JWKS) with support for dev mode, role-based access control via DynamoDB, and Azure AD SSO for frontend. Includes new /api/me endpoint and permission checks on API routes. + +### 2026-05-05 +- **refactor(hooks): defer changelog updates to post-commit hook** — Moved changelog file writing to post-commit hook to ensure clean staging area during prepare-commit-msg phase, fixing issues with changelog generation workflow. + +### 2026-05-05 +- **style: add Rootly logo to header and empty state** — Added Rootly logo image to application header and empty state UI for improved branding and visual identity. + +### 2026-05-05 +- **refactor: decompose GraphView and enhance RAG with semantic search** — Refactored GraphView into modular components (controls, context menu, layout), added semantic search with Athena integration, improved backend caching of filter terms and datasets, increased proxy timeout to 300s, and enhanced UI with markdown tables, code copy buttons, and loading suggestions. + +### 2026-05-05 +- **feat(rag): add Claude commit hooks and improve knowledge chunking** — Added git hooks that auto-generate conventional commit messages and changelog entries using Claude Haiku; improved knowledge document splitting to handle large tables and long sections intelligently. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..f3f0975 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,91 @@ +# Contributing to Rootly + +## Prerequisites + +- Docker + Docker Compose +- Python 3.11+ +- Node 20+ (frontend) +- An `ANTHROPIC_API_KEY` + +## Local setup + +```bash +cp .env.example .env # fill in ANTHROPIC_API_KEY (and S3 vars if needed) +docker compose up --build +``` + +Without S3 access, the system runs against the sample events in `openlineage/` and `examples/`. + +## Project layout + +| Path | Responsibility | +|---|---| +| `rag/` | Ingestion, vectorization, RAG pipeline, tools | +| `backend/` | FastAPI app, Celery tasks | +| `frontend/src/` | React + TypeScript UI | +| `knowledge/` | Business docs indexed into ChromaDB | +| `conf/` | Domain/agent configuration YAML | + +## Making changes + +### Backend / RAG + +1. Edit code under `rag/` or `backend/`. +2. Restart the backend container: `docker compose restart backend`. +3. Re-index if you changed ingestion or vectorization: `POST /api/sync` or `python -m rag.query sync`. + +### Frontend + +```bash +cd frontend +npm install +npm run dev # dev server at http://localhost:5173 +npm run build # production build +``` + +### Adding a new RAG tool + +1. Create `rag/tools/.py` and implement the handler. +2. Register it in `rag/tools/__init__.py` (add to `TOOLS` list and `execute_tool_call` dispatcher). +3. Add a row to the tools table in `CLAUDE.md`. + +## Testing + +```bash +# Quick smoke test against local events +python -m rag.query ask "¿Qué datasets existen?" + +# Impact analysis +python -m rag.query impact +``` + +There is no automated test suite yet. Manual verification against `examples/` data is the current approach. + +## Commit style + +Follow [Conventional Commits](https://www.conventionalcommits.org/): + +``` +feat(rag): add reranking step to pipeline +fix(backend): avoid reload race on task completion +refactor(tools): extract S3 fetch helper +``` + +Scope is optional but encouraged (`rag`, `backend`, `frontend`, `tools`, `ingest`). + +## Pull requests + +- Branch from `main`, target `main`. +- One logical change per PR. +- Include a short description of *why*, not just what. +- If you change the RAG pipeline, note whether ChromaDB needs a full re-sync. + +## Environment variables + +| Variable | Required | Default | Purpose | +|---|---|---|---| +| `ANTHROPIC_API_KEY` | yes | — | Claude API access | +| `S3_BUCKET` | no | — | Source of OpenLineage events and Glue job code | +| `S3_EVENTS_PREFIX` | no | `openlineage/` | S3 prefix for `.ndjson` event files | +| `S3_JOBS_PREFIX` | no | `code/glue/jobs/AEMET/` | S3 prefix for Glue `.py` files | +| `REDIS_URL` | no | `redis://redis:6379/0` | Celery broker | diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..5ce9a24 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +# MIT License + +Copyright (c) 2026 lucabem + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/backend/auth.py b/backend/auth.py new file mode 100644 index 0000000..13312c7 --- /dev/null +++ b/backend/auth.py @@ -0,0 +1,120 @@ +""" +JWT authentication dependency for FastAPI. + +Modes (auto-detected from env vars, in priority order): + 1. RS256/JWKS — JWT_JWKS_URL set (Cognito, Auth0, Okta) + 2. HS256 — JWT_SECRET set (dev / internal shared secret) + 3. Dev mode — neither set — returns anonymous superuser, no HTTP error + +Required JWT claims: sub (user_id), email (optional, falls back to sub). +""" + +import logging +import os +import time +from dataclasses import dataclass +from typing import Optional + +import requests +from fastapi import HTTPException, Security +from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer +from jose import JWTError, jwk, jwt + +logger = logging.getLogger(__name__) + +JWT_SECRET = os.getenv("JWT_SECRET", "") +JWT_JWKS_URL = os.getenv("JWT_JWKS_URL", "") +JWT_ISSUER = os.getenv("JWT_ISSUER", "") +JWT_AUDIENCE = os.getenv("JWT_AUDIENCE", "") + +_bearer = HTTPBearer(auto_error=False) + +# JWKS cache: avoid fetching on every request +_jwks_cache: dict = {"keys": [], "fetched_at": 0.0} +_JWKS_TTL = 3600 # re-fetch after 1 hour + + +@dataclass +class AuthUser: + user_id: str + email: str + + +def _get_jwks() -> dict: + now = time.time() + if now - _jwks_cache["fetched_at"] < _JWKS_TTL and _jwks_cache["keys"]: + return _jwks_cache + try: + resp = requests.get(JWT_JWKS_URL, timeout=10) + resp.raise_for_status() + data = resp.json() + _jwks_cache.update({"keys": data.get("keys", []), "fetched_at": now}) + return _jwks_cache + except Exception as e: + logger.error(f"Failed to fetch JWKS from {JWT_JWKS_URL}: {e}") + if _jwks_cache["keys"]: + return _jwks_cache # serve stale on transient errors + raise HTTPException(status_code=503, detail="Auth service unavailable.") + + +def _decode_rs256(token: str) -> dict: + header = jwt.get_unverified_header(token) + kid = header.get("kid") + jwks = _get_jwks() + key_data = next((k for k in jwks["keys"] if k.get("kid") == kid), None) + if key_data is None: + # Retry once — key may have rotated + _jwks_cache["fetched_at"] = 0.0 + jwks = _get_jwks() + key_data = next((k for k in jwks["keys"] if k.get("kid") == kid), None) + if key_data is None: + raise JWTError(f"No public key found for kid={kid!r}") + public_key = jwk.construct(key_data) + options: dict = {"verify_aud": bool(JWT_AUDIENCE)} + return jwt.decode( + token, + public_key.to_dict(), + algorithms=["RS256"], + audience=JWT_AUDIENCE or None, + issuer=JWT_ISSUER or None, + options=options, + ) + + +def _decode_hs256(token: str) -> dict: + options: dict = {"verify_aud": bool(JWT_AUDIENCE)} + return jwt.decode( + token, + JWT_SECRET, + algorithms=["HS256"], + audience=JWT_AUDIENCE or None, + issuer=JWT_ISSUER or None, + options=options, + ) + + +def get_current_user( + credentials: Optional[HTTPAuthorizationCredentials] = Security(_bearer), +) -> AuthUser: + # Dev mode: no auth configured → anonymous superuser + if not JWT_SECRET and not JWT_JWKS_URL: + return AuthUser(user_id="anonymous", email="anonymous@local") + + if credentials is None: + raise HTTPException(status_code=401, detail="Authorization header required.") + + token = credentials.credentials + try: + if JWT_JWKS_URL: + payload = _decode_rs256(token) + else: + payload = _decode_hs256(token) + except JWTError as e: + raise HTTPException(status_code=401, detail=f"Invalid token: {e}") + + user_id: Optional[str] = payload.get("sub") + if not user_id: + raise HTTPException(status_code=401, detail="Token missing 'sub' claim.") + + email: str = payload.get("email") or payload.get("username") or user_id + return AuthUser(user_id=user_id, email=email) diff --git a/backend/main.py b/backend/main.py index 6418377..dbe8ed0 100644 --- a/backend/main.py +++ b/backend/main.py @@ -7,10 +7,12 @@ load_dotenv() -from fastapi import FastAPI, HTTPException +from fastapi import Depends, FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel +from backend.permissions import UserPermissions, get_permissions + from celery.result import AsyncResult from backend.tasks import ( @@ -26,14 +28,14 @@ ) from rag.tools.dataset import downstream_layers as _downstream_layers from rag.query import ask, load_history, _downstream_layers -from rag.vectorize import get_collection +from rag.vectorize import get_collection, load_graph_cache app = FastAPI(title="RAG Lineage API") -# @app.on_event("startup") -# def startup(): -# threading.Thread(target=_ensure_loaded, daemon=True).start() +@app.on_event("startup") +def startup(): + threading.Thread(target=_ensure_loaded, daemon=True).start() app.add_middleware( @@ -43,7 +45,7 @@ allow_headers=["*"], ) -_state: dict = {"G": None, "collection": None, "loaded_at": 0.0} +_state: dict = {"G": None, "collection": None, "loaded_at": 0.0, "filter_terms": None, "datasets_resp": None} _load_lock = threading.Lock() _last_reloaded_task: str | None = None @@ -58,13 +60,38 @@ def _ensure_loaded(): if _state["collection"] is not None: # otro hilo cargó mientras esperábamos return _state["G"], _state["collection"] - events = load_events() - G = build_graph(events) collection = get_collection() - if collection.count() == 0: - print("[ensure_loaded] WARNING: ChromaDB is empty. Run /api/sync.") + if collection.count() > 0: + G = load_graph_cache() + if G is not None: + print(f"[ensure_loaded] Restored from cache — {G.number_of_nodes()} nodes, {collection.count()} docs in ChromaDB.") + else: + print("[ensure_loaded] ChromaDB has data but no graph cache — rebuilding from local events.") + events = load_events() + G = build_graph(events) + else: + print("[ensure_loaded] ChromaDB is empty. Run /api/sync to populate.") + events = load_events() + G = build_graph(events) + _state.update({"G": G, "collection": collection, "loaded_at": time.time()}) + all_metas = collection.get(include=["metadatas"])["metadatas"] or [] + _state["filter_terms"] = { + "namespaces": {str(v) for m in all_metas if (v := m.get("namespace"))}, + "names": {str(v) for m in all_metas if (v := m.get("name"))}, + } + + datasets = [ + {"key": k, "namespace": d.get("namespace", ""), "name": d.get("name", k)} + for k, d in G.nodes(data=True) + if d.get("kind") == "dataset" + ] + _state["datasets_resp"] = { + "datasets": datasets, + "namespaces": sorted({d["namespace"] for d in datasets if d["namespace"]}), + } + return _state["G"], _state["collection"] @@ -103,8 +130,20 @@ def health(): return {"status": "ok"} +@app.get("/api/me") +def me(perms: UserPermissions = Depends(get_permissions)): + return { + "user_id": perms.user_id, + "email": perms.email, + "is_admin": perms.is_admin, + "allowed_datasets": perms.allowed_datasets, + "allowed_namespaces": perms.allowed_namespaces, + "has_role_arn": perms.role_arn is not None, + } + + @app.get("/api/stats") -def stats(): +def stats(perms: UserPermissions = Depends(get_permissions)): try: G, collection = _state["G"], _state["collection"] n_ds = sum(1 for _, d in G.nodes(data=True) if d.get("kind") == "dataset") @@ -126,7 +165,7 @@ def stats(): @app.post("/api/chat") -def chat(req: ChatRequest): +def chat(req: ChatRequest, perms: UserPermissions = Depends(get_permissions)): try: history = [{"role": m.role, "content": m.content} for m in req.history] answer = ask( @@ -137,6 +176,7 @@ def chat(req: ChatRequest): G=_state["G"], bucket=S3_BUCKET, jobs_prefix=S3_JOBS_PREFIX, + filter_terms=_state.get("filter_terms"), ) _save_chat(req.question, answer) return {"answer": answer} @@ -151,7 +191,9 @@ def chat(req: ChatRequest): @app.get("/api/datasets") -def list_datasets(): +def list_datasets(perms: UserPermissions = Depends(get_permissions)): + if _state.get("datasets_resp"): + return _state["datasets_resp"] try: G = _state["G"] datasets = [] @@ -171,7 +213,7 @@ def list_datasets(): @app.post("/api/impact") -def impact(req: ImpactRequest): +def impact(req: ImpactRequest, perms: UserPermissions = Depends(get_permissions)): G = _state["G"] if G is None: raise HTTPException(status_code=503, detail="Graph not loaded yet.") @@ -208,7 +250,7 @@ def impact(req: ImpactRequest): @app.post("/api/sync") -def do_sync(): +def do_sync(perms: UserPermissions = Depends(get_permissions)): try: task = run_sync_task.delay( bucket=S3_BUCKET, @@ -226,7 +268,7 @@ def do_sync(): @app.get("/api/task/{task_id}") -def task_status(task_id: str): +def task_status(task_id: str, perms: UserPermissions = Depends(get_permissions)): global _last_reloaded_task result = AsyncResult(task_id, app=celery_app) data: dict = {"task_id": task_id, "status": result.state} @@ -245,6 +287,8 @@ def task_status(task_id: str): _state["G"] = None _state["collection"] = None _state["loaded_at"] = 0.0 + _state["filter_terms"] = None + _state["datasets_resp"] = None should_reload = True else: print(f"[task_status] task={task_id} SUCCESS (already processed) — skipping reload.") @@ -257,7 +301,7 @@ def task_status(task_id: str): @app.get("/api/graph") -def graph(): +def graph(perms: UserPermissions = Depends(get_permissions)): G = _state["G"] if G is None: raise HTTPException(status_code=503, detail="Graph not loaded yet.") @@ -278,32 +322,87 @@ def graph(): return {"nodes": nodes, "edges": edges} +@app.get("/api/namespaces") +def namespaces(perms: UserPermissions = Depends(get_permissions)): + G = _state["G"] + if G is None: + return {"namespaces": []} + ns = sorted({ + d.get("namespace", "") + for _, d in G.nodes(data=True) + if d.get("kind") == "dataset" and d.get("namespace") + }) + return {"namespaces": ns} + + +@app.get("/api/tables") +def tables(namespace: str = "", perms: UserPermissions = Depends(get_permissions)): + G = _state["G"] + if G is None: + return {"tables": []} + result = sorted({ + d.get("name", k) + for k, d in G.nodes(data=True) + if d.get("kind") == "dataset" + and (not namespace or d.get("namespace", "") == namespace) + }) + return {"tables": result} + + +@app.get("/api/schema") +def schema(dataset: str, namespace: str | None = None, perms: UserPermissions = Depends(get_permissions)): + G = _state["G"] + if G is None: + raise HTTPException(status_code=503, detail="Graph not loaded yet.") + matches = [ + d for _, d in G.nodes(data=True) + if d.get("kind") == "dataset" + and d.get("name", "").lower() == dataset.lower() + and (not namespace or d.get("namespace", "") == namespace) + ] + if not matches: + raise HTTPException(status_code=404, detail=f"Dataset '{dataset}' not found.") + node = matches[0] + columns = [] + for raw in node.get("schema", []): + if " (" in raw and raw.endswith(")"): + name, type_part = raw.rsplit(" (", 1) + columns.append({"name": name, "type": type_part[:-1]}) + else: + columns.append({"name": raw, "type": ""}) + return { + "dataset": node.get("name"), + "namespace": node.get("namespace"), + "columns": columns, + } + + @app.get("/api/trace") -def trace(dataset: str, field: str | None = None): +def trace(dataset: str, field: str | None = None, namespace: str | None = None, perms: UserPermissions = Depends(get_permissions)): """ Trace column-level lineage for a field in a dataset. Useful for GDPR audits: find the exact origin of a sensitive field. ?dataset=customers&field=email -> recursive trace of email in customers ?dataset=customers -> full column lineage for the dataset + ?namespace=ns&dataset=customers -> scoped to a specific namespace """ G = _state["G"] if G is None: raise HTTPException(status_code=503, detail="Graph not loaded yet.") - matches = [ - (k, d) - for k, d in G.nodes(data=True) - if d.get("kind") == "dataset" - and dataset.lower() in d.get("name", "").lower() - ] + all_ds = [(k, d) for k, d in G.nodes(data=True) if d.get("kind") == "dataset"] + if namespace: + all_ds = [(k, d) for k, d in all_ds if d.get("namespace", "") == namespace] + exact = [(k, d) for k, d in all_ds if d.get("name", "").lower() == dataset.lower()] + matches = exact or [(k, d) for k, d in all_ds if dataset.lower() in d.get("name", "").lower()] if not matches: raise HTTPException(status_code=404, detail=f"Dataset '{dataset}' not found.") def _trace_field(ds_name: str, out_field: str, visited: set, depth: int) -> dict: """Recursively trace a field back to its origins.""" if depth > 6 or (ds_name, out_field) in visited: - return {"field": out_field, "dataset": ds_name, "sources": [], "cycle": depth > 6} + return {"field": out_field, "dataset": ds_name, "namespace": "", "sources": [], "cycle": depth > 6} visited.add((ds_name, out_field)) src_node = next( @@ -313,6 +412,7 @@ def _trace_field(ds_name: str, out_field: str, visited: set, depth: int) -> dict ), None, ) + ns = src_node[1].get("namespace", "") if src_node else "" col_lineage = src_node[1].get("column_lineage", {}) if src_node else {} sources_meta = col_lineage.get(out_field, []) @@ -323,7 +423,7 @@ def _trace_field(ds_name: str, out_field: str, visited: set, depth: int) -> dict child["transform_subtype"] = s.get("subtype", "?") sources.append(child) - return {"field": out_field, "dataset": ds_name, "sources": sources} + return {"field": out_field, "dataset": ds_name, "namespace": ns, "sources": sources} results = [] for node_key, data in matches: @@ -360,13 +460,13 @@ def _trace_field(ds_name: str, out_field: str, visited: set, depth: int) -> dict @app.get("/api/history") -def history(n: int = 20): +def history(n: int = 20, perms: UserPermissions = Depends(get_permissions)): msgs = load_history(n=n, chat_dir=CHAT_DIR) return {"messages": msgs} @app.get("/api/config") -def config(): +def config(perms: UserPermissions = Depends(get_permissions)): return { "mode": "s3" if S3_BUCKET else "local", "bucket": S3_BUCKET, diff --git a/backend/permissions.py b/backend/permissions.py new file mode 100644 index 0000000..bec2c45 --- /dev/null +++ b/backend/permissions.py @@ -0,0 +1,115 @@ +""" +Permission store backed by DynamoDB. + +DynamoDB table schema (set DYNAMODB_PERMISSIONS_TABLE env var): + PK: user_id (String) — matches JWT 'sub' claim + email (String) — display / audit + role_arn (String) — IAM role ARN for STS AssumeRole (Athena/S3) + allowed_datasets (SS) — StringSet; absent = no restriction + allowed_namespaces (SS) — StringSet; absent = no restriction + is_admin (Boolean) — bypasses all dataset restrictions + +Dev mode (DYNAMODB_PERMISSIONS_TABLE not set): every authenticated user +gets superuser permissions so local development works without AWS. + +Example CLI to create the table and seed a user: + aws dynamodb create-table \ + --table-name rootly-permissions \ + --attribute-definitions AttributeName=user_id,AttributeType=S \ + --key-schema AttributeName=user_id,KeyType=HASH \ + --billing-mode PAY_PER_REQUEST + + aws dynamodb put-item --table-name rootly-permissions --item '{ + "user_id": {"S": "auth0|abc123"}, + "email": {"S": "alice@company.com"}, + "role_arn": {"S": "arn:aws:iam::123456789:role/DataAnalyst"}, + "allowed_datasets": {"SS": ["orders", "customers", "contratos"]}, + "allowed_namespaces": {"SS": ["s3://prod-bucket/data/"]}, + "is_admin": {"BOOL": false} + }' +""" + +import logging +import os +from dataclasses import dataclass, field +from typing import Optional + +import boto3 +from botocore.exceptions import ClientError +from fastapi import Depends, HTTPException + +from backend.auth import AuthUser, get_current_user + +logger = logging.getLogger(__name__) + +PERMISSIONS_TABLE = os.getenv("DYNAMODB_PERMISSIONS_TABLE", "") +AWS_REGION = os.getenv("AWS_DEFAULT_REGION", "eu-west-1") + + +@dataclass +class UserPermissions: + user_id: str + email: str + role_arn: Optional[str] + # None means unrestricted (all datasets / namespaces allowed) + allowed_datasets: Optional[list[str]] + allowed_namespaces: Optional[list[str]] + is_admin: bool = False + + def can_access_dataset(self, dataset_name: str) -> bool: + if self.is_admin or self.allowed_datasets is None: + return True + return dataset_name in self.allowed_datasets + + def can_access_namespace(self, namespace: str) -> bool: + if self.is_admin or self.allowed_namespaces is None: + return True + return namespace in self.allowed_namespaces + + +def _superuser(user: AuthUser) -> UserPermissions: + return UserPermissions( + user_id=user.user_id, + email=user.email, + role_arn=None, + allowed_datasets=None, + allowed_namespaces=None, + is_admin=True, + ) + + +def _dynamo_client(): + return boto3.resource("dynamodb", region_name=AWS_REGION) + + +def get_permissions(user: AuthUser = Depends(get_current_user)) -> UserPermissions: + if not PERMISSIONS_TABLE: + logger.debug("DYNAMODB_PERMISSIONS_TABLE not set — dev mode, returning superuser.") + return _superuser(user) + + # anonymous in dev mode (JWT_SECRET / JWT_JWKS_URL not set) → superuser + if user.user_id == "anonymous": + return _superuser(user) + + try: + table = _dynamo_client().Table(PERMISSIONS_TABLE) + resp = table.get_item(Key={"user_id": user.user_id}) + item = resp.get("Item") + except ClientError as e: + logger.error(f"DynamoDB lookup failed for user_id={user.user_id}: {e}") + raise HTTPException(status_code=503, detail="Permission store unavailable.") + + if item is None: + raise HTTPException(status_code=403, detail="User not authorized.") + + raw_datasets = item.get("allowed_datasets") + raw_namespaces = item.get("allowed_namespaces") + + return UserPermissions( + user_id=user.user_id, + email=item.get("email", user.email), + role_arn=item.get("role_arn"), + allowed_datasets=list(raw_datasets) if raw_datasets is not None else None, + allowed_namespaces=list(raw_namespaces) if raw_namespaces is not None else None, + is_admin=bool(item.get("is_admin", False)), + ) diff --git a/backend/requirements.txt b/backend/requirements.txt index 0fc729e..b21203e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,3 +1,5 @@ fastapi>=0.110.0 uvicorn[standard]>=0.27.0 celery[redis]>=5.3.0 +python-jose[cryptography]>=3.3.0 +requests>=2.31.0 diff --git a/docker-compose.yml b/docker-compose.yml index aad4b69..c7f97f5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -83,6 +83,9 @@ services: build: context: frontend dockerfile: Dockerfile + args: + VITE_AZURE_CLIENT_ID: ${VITE_AZURE_CLIENT_ID:-} + VITE_AZURE_TENANT_ID: ${VITE_AZURE_TENANT_ID:-} container_name: rag-ui ports: diff --git a/frontend/Dockerfile b/frontend/Dockerfile index 0d708e6..4b98fcd 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -3,6 +3,10 @@ WORKDIR /app COPY package.json ./ RUN npm install COPY . . +ARG VITE_AZURE_CLIENT_ID +ARG VITE_AZURE_TENANT_ID +ENV VITE_AZURE_CLIENT_ID=$VITE_AZURE_CLIENT_ID +ENV VITE_AZURE_TENANT_ID=$VITE_AZURE_TENANT_ID RUN npm run build FROM nginx:alpine diff --git a/frontend/index.html b/frontend/index.html index 1d17ffe..1131e99 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -3,7 +3,7 @@ - RAG Lineage + Rootly
diff --git a/frontend/nginx.conf b/frontend/nginx.conf index a92b5de..4d8e02a 100644 --- a/frontend/nginx.conf +++ b/frontend/nginx.conf @@ -12,6 +12,6 @@ server { proxy_http_version 1.1; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; - proxy_read_timeout 120s; + proxy_read_timeout 300s; } } diff --git a/frontend/package-lock.json b/frontend/package-lock.json index a113cfe..39b93ac 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -8,6 +8,8 @@ "name": "rag-lineage-ui", "version": "0.1.0", "dependencies": { + "@azure/msal-browser": "^3.0.0", + "@azure/msal-react": "^2.0.0", "@dagrejs/dagre": "^3.0.0", "react": "^18.2.0", "react-dom": "^18.2.0", @@ -23,6 +25,40 @@ "vite": "^5.1.0" } }, + "node_modules/@azure/msal-browser": { + "version": "3.30.0", + "resolved": "https://registry.npmjs.org/@azure/msal-browser/-/msal-browser-3.30.0.tgz", + "integrity": "sha512-I0XlIGVdM4E9kYP5eTjgW8fgATdzwxJvQ6bm2PNiHaZhEuUz47NYw1xHthC9R+lXz4i9zbShS0VdLyxd7n0GGA==", + "license": "MIT", + "dependencies": { + "@azure/msal-common": "14.16.1" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/@azure/msal-common": { + "version": "14.16.1", + "resolved": "https://registry.npmjs.org/@azure/msal-common/-/msal-common-14.16.1.tgz", + "integrity": "sha512-nyxsA6NA4SVKh5YyRpbSXiMr7oQbwark7JU9LMeg6tJYTSPyAGkdx61wPT4gyxZfxlSxMMEyAsWaubBlNyIa1w==", + "license": "MIT", + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/@azure/msal-react": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@azure/msal-react/-/msal-react-2.2.0.tgz", + "integrity": "sha512-2V+9JXeXyyjYNF92y5u0tU4el9px/V1+vkRuN+DtoxyiMHCtYQpJoaFdGWArh43zhz5aqQqiGW/iajPDSu3QsQ==", + "license": "MIT", + "engines": { + "node": ">=10" + }, + "peerDependencies": { + "@azure/msal-browser": "^3.27.0", + "react": "^16.8.0 || ^17 || ^18" + } + }, "node_modules/@babel/code-frame": { "version": "7.29.0", "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.29.0.tgz", diff --git a/frontend/package.json b/frontend/package.json index 1b57159..dba440d 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -8,6 +8,8 @@ "preview": "vite preview" }, "dependencies": { + "@azure/msal-browser": "^3.0.0", + "@azure/msal-react": "^2.0.0", "@dagrejs/dagre": "^3.0.0", "react": "^18.2.0", "react-dom": "^18.2.0", diff --git a/frontend/public/rootly.jpeg b/frontend/public/rootly.jpeg new file mode 100644 index 0000000..7199542 Binary files /dev/null and b/frontend/public/rootly.jpeg differ diff --git a/frontend/src/App.css b/frontend/src/App.css index b8a4820..c49ebfb 100644 --- a/frontend/src/App.css +++ b/frontend/src/App.css @@ -57,11 +57,13 @@ body { color: var(--text); } -.header-title .dot { - width: 8px; - height: 8px; +.header-logo { + width: 28px; + height: 28px; border-radius: 50%; - background: var(--accent); + object-fit: cover; + border: 1px solid var(--border); + flex-shrink: 0; } .header-stats { @@ -198,6 +200,15 @@ body { opacity: 0.4; } +.empty-logo { + width: 80px; + height: 80px; + border-radius: 50%; + object-fit: cover; + border: 2px solid var(--border); + opacity: 0.85; +} + .empty-state h3 { font-size: 16px; font-weight: 500; @@ -292,12 +303,13 @@ body { padding: 10px 14px; } -.typing span { +.typing span:not(.loading-hint) { width: 6px; height: 6px; background: var(--text-muted); border-radius: 50%; animation: bounce 1.2s infinite; + flex-shrink: 0; } .typing span:nth-child(2) { animation-delay: 0.2s; } @@ -827,3 +839,96 @@ body { from { opacity: 0; transform: translateY(10px); } to { opacity: 1; transform: translateY(0); } } + +/* ── Markdown tables ── */ +.bubble table { + width: 100%; + border-collapse: collapse; + margin: 8px 0; + font-size: 12px; +} +.bubble th, .bubble td { + padding: 6px 10px; + border: 1px solid var(--border); + text-align: left; + vertical-align: top; +} +.bubble th { + background: var(--surface2); + font-weight: 600; + color: var(--text); +} +.bubble tr:nth-child(even) td { + background: rgba(255,255,255,0.02); +} + +/* ── Code block with copy button ── */ +.code-block-wrapper { + position: relative; + margin: 8px 0; +} +.copy-btn { + position: absolute; + top: 6px; + right: 6px; + background: var(--surface2); + border: 1px solid var(--border); + color: var(--text-muted); + border-radius: 4px; + padding: 2px 8px; + font-size: 11px; + cursor: pointer; + transition: all 0.15s; + z-index: 1; + font-family: inherit; +} +.copy-btn:hover { + border-color: var(--accent); + color: var(--accent); +} +.copy-btn.copied { + border-color: var(--success); + color: var(--success); +} +.code-block-wrapper pre { + margin: 0; +} + +/* ── Suggestion chips ── */ +.suggestions { + display: flex; + flex-wrap: wrap; + gap: 8px; + justify-content: center; + margin-top: 12px; + max-width: 480px; +} +.suggestion-chip { + background: var(--surface); + border: 1px solid var(--border); + color: var(--text-muted); + border-radius: 20px; + padding: 6px 14px; + font-size: 12px; + cursor: pointer; + transition: all 0.15s; + font-family: inherit; + text-align: left; +} +.suggestion-chip:hover { + border-color: var(--accent); + color: var(--accent); + background: var(--accent-dim); +} + +/* ── Loading hint text ── */ +.loading-hint { + font-size: 11px; + color: var(--text-muted); + margin-left: 10px; + opacity: 0.8; + font-style: italic; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index 24aba65..314570e 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -1,8 +1,49 @@ import { useEffect, useRef, useState } from 'react' import ReactMarkdown from 'react-markdown' import remarkGfm from 'remark-gfm' +import { useMsal, MsalAuthenticationTemplate, useIsAuthenticated } from '@azure/msal-react' +import { InteractionType } from '@azure/msal-browser' import { Autocomplete } from './components/Autocomplete' import { GraphView } from './components/GraphView' +import { useAuthFetch } from './auth/useAuthFetch' +import { loginRequest } from './auth/msalConfig' + +const SUGGESTIONS = [ + '¿Qué datasets existen en producción?', + '¿Qué jobs producen el dataset contratos?', + '¿Qué se rompe si cambio el schema de orders?', + '¿De dónde viene la columna id_cliente?', +] + +const LOADING_HINTS = [ + 'Consultando el grafo de linaje…', + 'Buscando en el índice semántico…', + 'Ejecutando herramienta…', + 'Generando respuesta…', +] + +function CodeBlock({ children }: { children: React.ReactNode }) { + const [copied, setCopied] = useState(false) + const text = (() => { + const child = (children as any)?.[0] + return child?.props?.children ?? '' + })() + return ( +
+ +
{children}
+
+ ) +} type Role = 'user' | 'assistant' type Tab = 'chat' | 'impact' | 'graph' | 'tasks' @@ -65,7 +106,10 @@ function timeAgo(ts: number) { return `hace ${Math.floor(s / 3600)}h` } -export default function App() { +function AppInner() { + const { instance, accounts } = useMsal() + const authFetch = useAuthFetch() + const account = accounts[0] const [tab, setTab] = useState('chat') const [messages, setMessages] = useState([]) const [input, setInput] = useState('') @@ -80,6 +124,7 @@ export default function App() { const [allDatasets, setAllDatasets] = useState([]) const [allNamespaces, setAllNamespaces] = useState([]) const [tasks, setTasks] = useState(loadTasks) + const [loadingHint, setLoadingHint] = useState(0) const messagesEndRef = useRef(null) const MAX_TASK_AGE_MS = 24 * 60 * 60 * 1000 @@ -101,6 +146,13 @@ export default function App() { return () => clearTimeout(t) }, [toast]) + // Cycle loading hint text while waiting for response + useEffect(() => { + if (!loading) { setLoadingHint(0); return } + const t = setInterval(() => setLoadingHint(h => (h + 1) % LOADING_HINTS.length), 3000) + return () => clearInterval(t) + }, [loading]) + // Persist tasks useEffect(() => { saveTasks(tasks) }, [tasks]) @@ -110,7 +162,7 @@ export default function App() { const interval = setInterval(async () => { for (const task of pendingTasks) { try { - const r = await fetch(`/api/task/${task.id}`) + const r = await authFetch(`/api/task/${task.id}`) const data = await r.json() if (data.status !== task.status) { setTasks(prev => prev.map(t => @@ -125,7 +177,7 @@ export default function App() { : t )) if (data.status === 'SUCCESS') { - await fetch('/api/reload', { method: 'POST' }) + await authFetch('/api/reload', { method: 'POST' }) fetchStats() fetchDatasets() setToast({ msg: `Sync completado - ${data.result?.docs ?? 0} docs`, type: 'success' }) @@ -142,7 +194,7 @@ export default function App() { async function fetchHistory() { try { - const r = await fetch('/api/history?n=30') + const r = await authFetch('/api/history?n=30') const data = await r.json() const loaded: Message[] = (data.messages ?? []).map((m: { role: Role; content: string }) => ({ id: ++msgId, @@ -155,7 +207,7 @@ export default function App() { async function fetchDatasets() { try { - const r = await fetch('/api/datasets') + const r = await authFetch('/api/datasets') const data = await r.json() setAllDatasets(data.datasets ?? []) setAllNamespaces(data.namespaces ?? []) @@ -164,22 +216,20 @@ export default function App() { async function fetchStats() { try { - const r = await fetch('/api/stats') + const r = await authFetch('/api/stats') setStats(await r.json()) } catch { setStats({ datasets: 0, jobs: 0, edges: 0, indexed_docs: 0, error: 'offline' }) } } - async function sendMessage() { - const q = input.trim() + async function sendQuestion(q: string) { if (!q || loading) return - setInput('') setMessages(prev => [...prev, { id: ++msgId, role: 'user', content: q }]) setLoading(true) try { - const history = messages.map(m => ({ role: m.role, content: m.content })) - const r = await fetch('/api/chat', { + const history = messages.slice(-10).map(m => ({ role: m.role, content: m.content })) + const r = await authFetch('/api/chat', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ question: q, history }), @@ -195,10 +245,31 @@ export default function App() { } } + async function sendMessage() { + const q = input.trim() + if (!q) return + setInput('') + await sendQuestion(q) + } + + function handleAskAboutNode(node: { kind: string; name: string; namespace: string; field?: string }) { + setTab('chat') + const ns = node.namespace ? ` (${node.namespace})` : '' + let q: string + if (node.field) { + q = `Cuéntame sobre la columna "${node.field}" del dataset "${node.name}"${ns}: de dónde viene, qué transformaciones sufre y qué jobs la producen o modifican.` + } else if (node.kind === 'job') { + q = `Cuéntame sobre el job ETL "${node.name}": qué datasets lee y escribe, cuál es su lógica y su historial de ejecuciones recientes.` + } else { + q = `Cuéntame sobre el dataset "${node.name}"${ns}: su schema, los jobs que lo producen y consumen, y si tiene linaje de columnas disponible.` + } + sendQuestion(q) + } + async function handleSync() { setSyncing(true) try { - const r = await fetch('/api/sync', { method: 'POST' }) + const r = await authFetch('/api/sync', { method: 'POST' }) const data = await r.json() if (!r.ok) throw new Error(data.detail || 'Error') const newTask: TaskEntry = { @@ -222,7 +293,7 @@ export default function App() { setImpactLoading(true) setImpactResults(null) try { - const r = await fetch('/api/impact', { + const r = await authFetch('/api/impact', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ dataset: table, namespace: impactNamespace.trim() || null }), @@ -253,13 +324,19 @@ export default function App() { ) + const isAuthenticated = useIsAuthenticated() + + // When MSAL is configured but the user hasn't finished the redirect yet, + // render nothing — MsalAuthenticationTemplate handles the loading/redirect. + if (import.meta.env.VITE_AZURE_CLIENT_ID && !isAuthenticated) return null + return ( <> {/* Header */}
- - RAG Lineage + Rootly + Rootly
{stats && !stats.error ? ( @@ -271,6 +348,20 @@ export default function App() { ) : ( sin índice )} + {account && ( + <> + + {account.name ?? account.username} + + + + )} + ))} +
)} {messages.map(msg => ( @@ -315,7 +411,14 @@ export default function App() { {msg.role === 'user' ? 'Tú' : 'Asistente'}
{msg.role === 'assistant' - ? {msg.content} + ? ( + {children} }} + > + {msg.content} + + ) : msg.content}
@@ -323,7 +426,12 @@ export default function App() { {loading && (
Asistente -
+
+
+ + {LOADING_HINTS[loadingHint]} +
+
)}
@@ -331,7 +439,13 @@ export default function App() {