# Source: smallcode/.env.example (BlueFenixProductions/smallcode, master branch)
# SmallCode Configuration
# Copy to .env in your project root

# ─── Model (required) ────────────────────────────────────────────────────────
# Recommended model size: 8B–35B parameters.
# SmallCode is tuned for this range — smaller models struggle with multi-step
# tool use, larger models don't need SmallCode's adaptations.
#
# Model name to request; replace the placeholder with your model's name.
SMALLCODE_MODEL=your-model-name
# Base URL of the model's API endpoint.
SMALLCODE_BASE_URL=http://localhost:1234/v1
# NOTE(review): presumably selects the API dialect spoken by the endpoint;
# list the accepted provider values here — TODO confirm.
SMALLCODE_PROVIDER=openai

# ─── Context ─────────────────────────────────────────────────────────────────
# Context budget; the read guard below triggers once "context usage is past
# budget".
# NOTE(review): presumably a percentage of the context window — TODO confirm
# the unit and valid range.
SMALLCODE_CONTEXT_BUDGET=70

# Cache-split: keeps the system prompt stable across turns so llama.cpp can
# reuse its KV cache. Without this, dynamic content (memory, knowledge) injected
# into the system prompt invalidates llama.cpp's context checkpoints every turn,
# causing the "erased invalidated context checkpoint" loop and full re-processing.
# Default: true (enabled). Set to false only if you need legacy behaviour.
# SMALLCODE_CACHE_SPLIT=false

# Context window size (auto-detected from model endpoint if not set)
# Set this if auto-detection doesn't work for your setup
# SMALLCODE_CONTEXT_WINDOW=32768

# Max chars kept per tool result (head + tail with truncation marker beyond).
# Default 8000 (~240 lines) fits most source files in one read_file call.
# Lower it for tight context budgets; raise it to avoid multi-read sequences.
# SMALLCODE_MAX_TOOL_RESULT_CHARS=8000

# Read guard (itsy port): when context usage is past budget OR a file alone
# exceeds 50% of the window, the read guard returns the first N lines plus
# an explicit "use grep / read a smaller line range" directive instead of a
# silent middle-of-file truncation. Tune the head size or disable.
# SMALLCODE_READ_GUARD=true
# SMALLCODE_READ_GUARD_HEAD_LINES=30

# Quality monitor (itsy port): catches empty turns, blank tool names,
# hallucinated tool names, and exact-repeat tool calls across turns. On a
# hit it injects a [QUALITY-MONITOR] steer; capped at 2 consecutive
# corrections to avoid correction spirals.
# SMALLCODE_QUALITY_MONITOR=true

# Max output tokens per chat completion. Default 8192 — enough headroom for
# 6k tokens of <think> reasoning + 2k tool-call output on reasoning models.
# Lower it to save inference time on non-reasoning models.
# SMALLCODE_MAX_OUTPUT_TOKENS=8192

# ─── Bash Timeout ────────────────────────────────────────────────────────────
# NOTE(review): presumably the time limit for a single bash command, in
# seconds like SMALLCODE_MODEL_TIMEOUT — TODO confirm the unit.
SMALLCODE_BASH_TIMEOUT=30

# ─── Model Timeout ───────────────────────────────────────────────────────────
# How long to wait for the model to respond (seconds). Default: 300 (5 minutes).
# Increase this if you're running on slow hardware (RK3588, CPU inference, etc.)
# and getting "model timed out" errors.
# SMALLCODE_MODEL_TIMEOUT=300
# For very slow hardware (10 minutes):
# SMALLCODE_MODEL_TIMEOUT=600

# ─── TUI ─────────────────────────────────────────────────────────────────────
# Colour theme for the terminal UI.
# NOTE(review): list the available theme names — TODO confirm.
SMALLCODE_THEME=dark
# NOTE(review): presumably true lets actions run without a confirmation
# prompt; document exactly what gets approved automatically — TODO confirm.
SMALLCODE_AUTO_APPROVE=false
# NOTE(review): presumably true switches to an older ("classic") TUI — TODO confirm.
SMALLCODE_CLASSIC_TUI=false

# ─── API Keys ────────────────────────────────────────────────────────────────
# Required when using a cloud provider (OpenAI, OpenRouter, DeepSeek, Anthropic)
# Also enables auto-escalation on hard fail when using a local model
# OPENAI_API_KEY=sk-...
# OPENROUTER_API_KEY=sk-or-v1-...
# ANTHROPIC_API_KEY=sk-ant-...
# DEEPSEEK_API_KEY=sk-...
#
# Override default escalation model:
# SMALLCODE_ESCALATION_MODEL=claude-sonnet-4-5

# ─── OpenRouter Example ──────────────────────────────────────────────────────
# SMALLCODE_BASE_URL=https://openrouter.ai/api/v1
# SMALLCODE_MODEL=openai/gpt-4o-mini
# OPENROUTER_API_KEY=sk-or-v1-...

# ─── Multi-Model Routing (optional) ──────────────────────────────────────────
# Auto-pick model based on task complexity. Each tier may use its own endpoint.
# SMALLCODE_MODEL_FAST=gemma-4-e4b
# SMALLCODE_MODEL_DEFAULT=qwen3-8b
# SMALLCODE_MODEL_STRONG=qwen3-14b
# SMALLCODE_BASE_URL_FAST=http://localhost:1234/v1
# SMALLCODE_BASE_URL_DEFAULT=http://localhost:1234/v1
# SMALLCODE_BASE_URL_STRONG=https://openrouter.ai/api/v1
#
# Local default + OpenRouter strong model:
# SMALLCODE_MODEL=qwen3:8b
# SMALLCODE_BASE_URL=http://localhost:11434/v1
# SMALLCODE_MODEL_STRONG=openai/gpt-4o-mini
# SMALLCODE_BASE_URL_STRONG=https://openrouter.ai/api/v1
# OPENROUTER_API_KEY=sk-or-v1-...

# ─── RTK (Rust Token Killer) ─────────────────────────────────────────────────
# RTK auto-rewrites bash commands (git, tests, ls, etc.) for 60-90% token savings.
# Install: https://github.com/rtk-ai/rtk
# SmallCode auto-detects rtk on PATH — no config needed.
# To disable, uncomment:
# SMALLCODE_RTK=false
#
# Not an RTK setting; it sits here without a section header of its own.
# NOTE(review): presumably controls automatic git commits of the agent's
# changes — TODO confirm and document.
SMALLCODE_AUTO_COMMIT=false

# ─── Thinking / Reasoning Display ───────────────────────────────────────────
# Show the model's internal reasoning (<think>...</think>) in the TUI.
# Disabled by default — thinking tokens are stripped from history but can be
# displayed dimmed before the final answer. Useful for debugging reasoning models
# like Qwen3, DeepSeek R1, Gemma 4 (see issue #48).
# SMALLCODE_SHOW_THINKING=true
#
# Cap thinking tokens stored in history (default 2000). Raise for complex tasks.
# SMALLCODE_THINKING_BUDGET=2000
# Disable thinking entirely (faster, less context used):
# SMALLCODE_THINKING_DISABLE=true

# ─── Web Browsing (disabled by default) ──────────────────────────────────────
# Enable browser-powered web search and page fetching
# Recommended for medium/large models (20B+) that can use web context effectively
# Uses Playwright with stealth mode for undetected browsing
# SMALLCODE_WEB_BROWSE=true

# ─── Escalation Limits ───────────────────────────────────────────────────────
# Limits on auto-escalation, which the API Keys section describes as kicking in
# "on hard fail when using a local model" once an API key is set.
# NOTE(review): presumably the maximum number of escalations; the scope (per
# task? per session?) is not stated — TODO confirm.
SMALLCODE_ESCALATION_MAX=5
# NOTE(review): presumably true asks for confirmation before escalating — TODO confirm.
SMALLCODE_ESCALATION_CONFIRM=true