-
Notifications
You must be signed in to change notification settings - Fork 346
Expand file tree
/
Copy pathapi.py
More file actions
152 lines (130 loc) · 5.67 KB
/
Copy pathapi.py
File metadata and controls
152 lines (130 loc) · 5.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""FastAPI app wiring Copilot onto the OpenAI Chat Completions API."""
import math
import threading
import time

from fastapi import FastAPI
from fastapi.responses import JSONResponse, StreamingResponse

from copilot import CopilotClient
from copilot.driver import ClearanceRequired

from .config import MODEL_NAME, RATE_LIMIT_BURST, RATE_LIMIT_RPM
from .openai_format import (
    completion_response,
    new_id,
    sse_event,
    stream_chunk,
)
from .prompt import messages_to_prompt
from .ratelimit import TokenBucket
from .schemas import ChatCompletionRequest
# The FastAPI application object; the route decorators further down register
# their handlers on it.
app = FastAPI(title="Copilot OpenAI-compatible API", version="1.0.0")
# Server runs headless and must never pop a visible browser mid-request. With
# both recovery passes disabled, an expired clearance surfaces immediately as a
# 503 (see ClearanceRequired handling below) so an operator can re-clear out of
# band (`python -m copilot login`). Headless auto-solve is intentionally off:
# it's unreliable on low-trust egress and a failed pass can wedge the session.
client = CopilotClient(interactive_clear=False, headless_clear=False)
# Remediation text sent to the caller whenever the client raises
# ClearanceRequired: as the 503 error message for plain requests, and as an
# inline "[error: ...]" content chunk for streamed ones.
# NOTE(review): the wording "could not be refreshed headlessly" reads as if a
# headless refresh was attempted, yet headless_clear is disabled above —
# confirm the intended wording.
_CLEARANCE_HELP = (
"Cloudflare clearance expired and could not be refreshed headlessly. "
"Re-clear in a browser: run `python -m copilot login` (or `python tests/diagnostic.py`) "
"and pass the 'verify you're human' check, then retry."
)
# Self-imposed rate limit on top of the upstream concurrency lock: this caps
# requests-per-minute, the lock caps requests-in-flight. See server/ratelimit.py.
_rate_limiter = TokenBucket(RATE_LIMIT_RPM, RATE_LIMIT_BURST)
def _rate_limited_response():
    """Spend a rate-limit token; build an OpenAI-shaped 429 when none is left.

    Returns:
        ``None`` when a token was available and the request may proceed.
        Otherwise a ``JSONResponse`` with status 429, a ``Retry-After``
        header, and an OpenAI-style ``error`` body.
    """
    allowed, wait = _rate_limiter.try_acquire()
    if allowed:
        return None
    # Round the wait *up*, never to nearest: ``round`` can round down (and
    # rounds x.5 to the even neighbour), which would advertise a Retry-After
    # shorter than the real wait and earn an obedient client another 429.
    # Assumes ``wait`` is a duration in seconds — TODO confirm against
    # TokenBucket.try_acquire. The floor of 1 avoids advertising "0".
    secs = max(1, math.ceil(wait))
    return JSONResponse(
        status_code=429,
        headers={"Retry-After": str(secs)},
        content={"error": {
            "message": (
                f"Rate limit exceeded (>{RATE_LIMIT_RPM:g} req/min). "
                f"Retry in {secs}s."
            ),
            "type": "rate_limit_error",
            "code": "rate_limit_exceeded",
        }},
    )
# Copilot's per-account chat socket doesn't tolerate concurrent conversations
# from one process (parallel requests error out or hang). This server bridges a
# single signed-in account, so we serialize upstream calls: concurrent HTTP
# requests queue here and run one at a time. Predictable, at the cost of
# parallelism — fine for a personal bridge.
# Both the streaming generator and the non-streaming handler acquire this lock
# around their call into the shared client.
_upstream_lock = threading.Lock()
def _stream(prompt: str, model: str, conversation_id=None):
    """Generate the SSE body of a streamed chat completion for ``prompt``.

    Emits, in order: a role chunk, one content chunk per non-empty text piece
    from the upstream stream, and a closing chunk with ``finish="stop"`` that
    carries the upstream conversation id. Any failure is reported in-band as a
    content chunk with ``finish="error"``. The literal ``[DONE]`` sentinel is
    always the last event.

    Pass ``conversation_id`` to continue an existing Copilot thread, or leave
    it as ``None`` to open a new one.
    """
    chunk_id = new_id()
    started = int(time.time())

    def _event(delta, **extra):
        # Every event of one response shares the same id/timestamp/model.
        return sse_event(stream_chunk(chunk_id, started, model, delta, **extra))

    try:
        # Hold the upstream lock for the whole exchange so only one chat runs
        # at a time; leaving the ``with`` (normally, on error, or when the
        # generator is closed) releases it.
        with _upstream_lock:
            yield _event({"role": "assistant"})
            upstream = client.stream(prompt, conversation_id=conversation_id)
            for piece in upstream:
                # Skip anything that is not a non-empty text fragment.
                if not (isinstance(piece, str) and piece):
                    continue
                yield _event({"content": piece})
            # The conversation id is read only after the stream has been
            # consumed, then attached to the closing chunk for the caller.
            yield _event(
                {},
                finish="stop",
                conversation_id=upstream.conversation_id,
            )
    except ClearanceRequired:
        yield _event({"content": f"\n[error: {_CLEARANCE_HELP}]"}, finish="error")
    except Exception as exc:  # report in-band rather than leaving the client hanging
        yield _event({"content": f"\n[error: {exc}]"}, finish="error")
    yield "data: [DONE]\n\n"
@app.get("/v1/models")
def list_models():
    """Return an OpenAI-style model list containing the single configured model."""
    model_entry = {
        "id": MODEL_NAME,
        "object": "model",
        "created": 0,
        "owned_by": "microsoft",
    }
    return {"object": "list", "data": [model_entry]}
def _error_json(status, message, kind):
    """Build a JSON error response with an OpenAI-style ``error`` envelope."""
    return JSONResponse(
        status_code=status,
        content={"error": {"message": message, "type": kind}},
    )


@app.post("/v1/chat/completions")
def chat_completions(req: ChatCompletionRequest):
    """Handle a chat completion request, streamed or not.

    Responses:
        400: the messages contain no text once flattened into a prompt.
        429: the self-imposed rate limit has no token left.
        503: the upstream client raised ``ClearanceRequired`` (non-streaming).
        502: any other upstream failure (non-streaming).
        Otherwise a ``StreamingResponse`` of SSE chunks when ``req.stream`` is
        set, or a single completion payload.
    """
    prompt = messages_to_prompt(req.messages)
    if not prompt.strip():
        return _error_json(400, "no text content in messages", "invalid_request_error")

    # Check the per-minute ceiling before touching the upstream lock, so excess
    # callers get a quick 429 rather than queueing behind serialized requests.
    throttled = _rate_limited_response()
    if throttled is not None:
        return throttled

    model = req.model or MODEL_NAME

    if req.stream:
        events = _stream(prompt, model, req.conversation_id)
        return StreamingResponse(events, media_type="text/event-stream")

    try:
        # One upstream chat at a time.
        with _upstream_lock:
            answer = client.chat(prompt, conversation_id=req.conversation_id)
    except ClearanceRequired:
        return _error_json(503, _CLEARANCE_HELP, "clearance_required")
    except Exception as exc:
        return _error_json(502, str(exc), "upstream_error")

    return completion_response(answer.text, model, answer.conversation_id)
@app.get("/")
def root():
    """Return the service name and the list of API endpoints it exposes."""
    endpoints = ["/v1/models", "/v1/chat/completions"]
    return {"service": "Copilot OpenAI-compatible API", "endpoints": endpoints}