Initial commit: Vera-AI v2 with async Qdrant, singleton pattern, monthly curation, and configurable UID/GID/TZ

Features:
- AsyncQdrantClient for non-blocking Qdrant operations
- Singleton pattern for QdrantService
- Monthly full curation (day 1 at 03:00)
- Configurable UID/GID for Docker
- Timezone support via TZ env var
- Configurable log directory (VERA_LOG_DIR)
- Volume mounts for config/, prompts/, logs/
- Standard Docker format with .env file

Fixes:
- Removed unused system_token_budget
- Added semantic_score_threshold config
- Fixed streaming response handling
- Python-based healthcheck (no curl dependency)
This commit is contained in:
Vera-AI
2026-03-26 12:37:25 -05:00
commit 50593e200d
21 changed files with 1916 additions and 0 deletions

215
app/proxy_handler.py Normal file
View File

@@ -0,0 +1,215 @@
# app/proxy_handler.py
from fastapi import Request
from fastapi.responses import JSONResponse, StreamingResponse
import httpx
import json
import re
import logging
import os
from pathlib import Path
from .config import config
from .singleton import get_qdrant_service
from .utils import count_tokens, build_augmented_messages
logger = logging.getLogger(__name__)
# Debug log directory (configurable via environment).
# debug_log() writes daily NDJSON files (debug_YYYY-MM-DD.log) under this
# directory; falls back to /app/logs when VERA_LOG_DIR is unset (the
# container default — this path is expected to be a mounted volume).
DEBUG_LOG_DIR = Path(os.environ.get("VERA_LOG_DIR", "/app/logs"))
def clean_message_content(content: str) -> str:
    """Strip the [Memory context] wrapper and return the bare user message.

    Falls back to removing a leading [YYYY-MM-DD ...] timestamp prefix when
    no wrapper is present; otherwise returns the content unchanged.
    """
    if not content:
        return content
    # OpenJarvis/OpenClaw wrapper: the real message follows "- user_msg:".
    wrapper = re.search(
        r'\[Memory context\].*?- user_msg:\s*(.+?)(?:\n\n|\Z)',
        content,
        re.DOTALL,
    )
    if wrapper is not None:
        return wrapper.group(1).strip()
    # No wrapper — drop a leading timestamp prefix if one is present.
    ts = re.match(r'\[\d{4}-\d{2}-\d{2}[^\]]*\]\s*', content)
    return content[ts.end():].strip() if ts else content
def debug_log(category: str, message: str, data: dict | None = None) -> None:
    """Append a debug entry to the daily debug log if debug mode is enabled.

    Entries are NDJSON lines written to VERA_LOG_DIR/debug_YYYY-MM-DD.log,
    so logs are persisted and easily accessible. No-op when config.debug is
    falsy.

    Args:
        category: Short tag for the entry (e.g. "INPUT", "OUTPUT", "STORAGE").
        message: Human-readable description of the event.
        data: Optional structured payload stored under the "data" key
            (omitted when falsy).
    """
    if not config.debug:
        return
    # Local import kept from the original to avoid paying the cost when
    # debug mode is off.
    from datetime import datetime, timezone
    log_dir = DEBUG_LOG_DIR
    log_dir.mkdir(parents=True, exist_ok=True)
    # BUG FIX: datetime.utcnow() is deprecated (Python 3.12) and returns a
    # naive datetime. Use an aware UTC timestamp; normalize the "+00:00"
    # suffix to "Z" to preserve the original log format.
    now = datetime.now(timezone.utc)
    log_path = log_dir / f"debug_{now.strftime('%Y-%m-%d')}.log"
    entry = {
        "timestamp": now.isoformat().replace("+00:00", "Z"),
        "category": category,
        "message": message,
    }
    if data:
        entry["data"] = data
    # Explicit encoding so log files are identical across platforms/locales.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry) + "\n")
async def handle_chat_non_streaming(body: dict):
    """Handle a non-streaming chat request.

    Cleans incoming user messages, augments them with memory context,
    forwards the request to Ollama, stores the resulting Q&A turn in
    Qdrant, and returns Ollama's JSON response (with the original model
    name echoed back).

    Args:
        body: The parsed request body; "messages" and "model" keys are read.

    Returns:
        JSONResponse with Ollama's chat completion payload.
    """
    incoming_messages = body.get("messages", [])
    model = body.get("model", "")
    debug_log("INPUT", "Non-streaming chat request", {"messages": incoming_messages})
    # Strip memory-context wrappers from user turns; pass others through.
    cleaned_messages = []
    for msg in incoming_messages:
        if msg.get("role") == "user":
            cleaned_content = clean_message_content(msg.get("content", ""))
            cleaned_messages.append({"role": "user", "content": cleaned_content})
        else:
            cleaned_messages.append(msg)
    # Inject retrieved memory into the conversation.
    augmented_messages = await build_augmented_messages(cleaned_messages)
    debug_log("THOUGHT", "Built augmented messages", {"augmented_count": len(augmented_messages)})
    # Forward to Ollama with the augmented messages in place of the originals.
    forwarded_body = body.copy()
    forwarded_body["messages"] = augmented_messages
    async with httpx.AsyncClient(timeout=300.0) as client:
        resp = await client.post(f"{config.ollama_host}/api/chat", json=forwarded_body)
        result = resp.json()
    debug_log("OUTPUT", "LLM non-streaming response", {"response": result})
    # Store the Q&A turn: pair the most recent user question with the answer.
    user_question = ""
    for msg in reversed(incoming_messages):
        if msg.get("role") == "user":
            user_question = clean_message_content(msg.get("content", ""))
            break
    assistant_answer = result.get("message", {}).get("content", "")
    if user_question and assistant_answer:
        qdrant_service = get_qdrant_service()
        try:
            result_id = await qdrant_service.store_qa_turn(user_question, assistant_answer)
            # CONSISTENCY FIX: log the stored ID on success like the
            # streaming handler does (result_id was previously unused).
            logger.info(f"[STORE] Success! ID={result_id[:8]}, Q={len(user_question)} chars")
            debug_log("STORAGE", "Non-streaming Q&A stored", {"question": user_question, "answer": assistant_answer})
        except Exception as e:
            # CONSISTENCY FIX: include the exception type, matching the
            # streaming handler's error format. Storage failure is
            # deliberately non-fatal — the response is still returned.
            logger.error(f"[STORE] FAILED: {type(e).__name__}: {e}")
    result["model"] = model
    return JSONResponse(content=result)
async def handle_chat(request: Request):
    """Handle a streaming chat request.

    Cleans incoming user messages, augments them with memory context,
    streams Ollama's NDJSON response back to the caller unchanged, and —
    once the stream completes — stores the full Q&A turn in Qdrant.

    Args:
        request: The incoming FastAPI request (JSON body with "messages").

    Returns:
        StreamingResponse relaying Ollama's NDJSON chunks.
    """
    body = await request.json()
    incoming_messages = body.get("messages", [])
    model = body.get("model", "")
    debug_log("INPUT", "Streaming chat request", {"messages": incoming_messages})
    # Strip memory-context wrappers from user turns; pass others through.
    cleaned_messages = []
    for msg in incoming_messages:
        if msg.get("role") == "user":
            cleaned_content = clean_message_content(msg.get("content", ""))
            cleaned_messages.append({"role": "user", "content": cleaned_content})
        else:
            cleaned_messages.append(msg)
    # Inject retrieved memory into the conversation.
    augmented_messages = await build_augmented_messages(cleaned_messages)
    debug_log("THOUGHT", "Built augmented messages for streaming", {
        "original_count": len(incoming_messages),
        "augmented_count": len(augmented_messages)
    })
    # Forward to Ollama with the augmented messages in place of the originals.
    forwarded_body = body.copy()
    forwarded_body["messages"] = augmented_messages
    # Drop body-specific headers; httpx recomputes them for the outbound
    # request (json= sets its own content-type and length).
    headers = dict(request.headers)
    headers.pop("content-length", None)
    headers.pop("Content-Length", None)
    headers.pop("content-type", None)
    headers.pop("Content-Type", None)

    async def stream_response():
        def extract_content(raw: bytes) -> str:
            """Pull message.content from one NDJSON record; '' if unparseable."""
            try:
                data = json.loads(raw)
            except json.JSONDecodeError:
                return ""
            if "message" in data and "content" in data["message"]:
                return data["message"]["content"]
            return ""

        full_content = ""
        pending = b""
        async with httpx.AsyncClient(timeout=300.0) as client:
            # BUG FIX: client.post() reads the entire response body before
            # aiter_bytes() ever runs, so nothing was actually streamed
            # incrementally. client.stream() yields chunks as they arrive.
            async with client.stream(
                "POST",
                f"{config.ollama_host}/api/chat",
                json=forwarded_body,
                headers=headers,
            ) as resp:
                async for chunk in resp.aiter_bytes():
                    yield chunk
                    # BUG FIX: accumulate raw bytes and parse only complete
                    # lines. Decoding each chunk in isolation corrupted
                    # NDJSON records (and multi-byte UTF-8 sequences) that
                    # straddle chunk boundaries, silently losing content.
                    pending += chunk
                    while b"\n" in pending:
                        line, pending = pending.split(b"\n", 1)
                        if line.strip():
                            full_content += extract_content(line)
        # Flush any final record that arrived without a trailing newline.
        if pending.strip():
            full_content += extract_content(pending)
        debug_log("OUTPUT", "LLM streaming response complete", {
            "content_length": len(full_content)
        })
        # Store the Q&A turn: pair the most recent user question with the
        # fully accumulated streamed answer.
        user_question = ""
        for msg in reversed(incoming_messages):
            if msg.get("role") == "user":
                user_question = clean_message_content(msg.get("content", ""))
                break
        if user_question and full_content:
            qdrant_service = get_qdrant_service()
            try:
                result_id = await qdrant_service.store_qa_turn(user_question, full_content)
                logger.info(f"[STORE] Success! ID={result_id[:8]}, Q={len(user_question)} chars")
            except Exception as e:
                # Non-fatal: the response already streamed to the client.
                logger.error(f"[STORE] FAILED: {type(e).__name__}: {e}")

    return StreamingResponse(stream_response(), media_type="application/x-ndjson")
async def forward_to_ollama(request: Request, path: str):
    """Transparently proxy an arbitrary request to the Ollama backend.

    Replays the incoming method, body, and headers against Ollama at the
    given path and returns the raw httpx response for the caller to adapt.
    """
    payload = await request.body()
    outbound_headers = dict(request.headers)
    # httpx recomputes the body length itself; a stale value would corrupt
    # the forwarded request.
    for stale_key in ("content-length", "Content-Length"):
        outbound_headers.pop(stale_key, None)
    async with httpx.AsyncClient(timeout=300.0) as client:
        return await client.request(
            method=request.method,
            url=f"{config.ollama_host}{path}",
            content=payload,
            headers=outbound_headers,
        )