2 Commits

Author SHA1 Message Date
Vera-AI
34304a79e0 v2.0.2: Production release with role parsing fix and threshold correction 2026-03-27 13:42:22 -05:00
Vera-AI
c78b3f2bb6 fix: parse curated turns into proper user/assistant roles
- Added parse_curated_turn() function to correctly parse stored memories
- Fixed build_augmented_messages() to use proper message roles
- Layer 2 (semantic) and Layer 3 (context) now correctly parse
  User: X / Assistant: Y format into separate messages
- Resolves context corruption where turns were dumped as a single user message

v2.0.2
2026-03-27 13:19:08 -05:00
4 changed files with 135 additions and 55 deletions

View File

@@ -4,15 +4,6 @@
# Build arguments:
# APP_UID: User ID for appuser (default: 999)
# APP_GID: Group ID for appgroup (default: 999)
#
# Build example:
# docker build --build-arg APP_UID=1000 --build-arg APP_GID=1000 -t vera-ai .
#
# Runtime environment variables:
# TZ: Timezone (default: UTC)
# APP_UID: User ID (informational)
# APP_GID: Group ID (informational)
# VERA_LOG_DIR: Debug log directory (default: /app/logs)
# Stage 1: Builder
FROM python:3.11-slim AS builder
@@ -20,9 +11,7 @@ FROM python:3.11-slim AS builder
WORKDIR /app
# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
&& rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --no-install-recommends build-essential && rm -rf /var/lib/apt/lists/*
# Copy requirements and install
COPY requirements.txt .
@@ -38,29 +27,25 @@ ARG APP_UID=999
ARG APP_GID=999
# Create group and user with specified UID/GID
RUN groupadd -g ${APP_GID} appgroup && \
useradd -u ${APP_UID} -g appgroup -r -m -s /bin/bash appuser
RUN groupadd -g ${APP_GID} appgroup && useradd -u ${APP_UID} -g appgroup -r -m -s /bin/bash appuser
# Copy installed packages from builder
COPY --from=builder /root/.local /home/appuser/.local
ENV PATH=/home/appuser/.local/bin:$PATH
# Create directories for mounted volumes
RUN mkdir -p /app/config /app/prompts /app/static /app/logs && \
chown -R ${APP_UID}:${APP_GID} /app
RUN mkdir -p /app/config /app/prompts /app/logs && chown -R ${APP_UID}:${APP_GID} /app
# Copy application code
COPY app/ ./app/
# Copy default config and prompts (can be overridden by volume mounts)
COPY config.toml /app/config/config.toml
COPY static/curator_prompt.md /app/prompts/curator_prompt.md
COPY static/systemprompt.md /app/prompts/systemprompt.md
COPY config/config.toml /app/config/config.toml
COPY prompts/curator_prompt.md /app/prompts/curator_prompt.md
COPY prompts/systemprompt.md /app/prompts/systemprompt.md
# Create symlinks for backward compatibility
RUN ln -sf /app/config/config.toml /app/config.toml && \
ln -sf /app/prompts/curator_prompt.md /app/static/curator_prompt.md && \
ln -sf /app/prompts/systemprompt.md /app/static/systemprompt.md
# Create symlink for config backward compatibility
RUN ln -sf /app/config/config.toml /app/config.toml
# Set ownership
RUN chown -R ${APP_UID}:${APP_GID} /app && chmod -R u+rw /app
@@ -70,11 +55,10 @@ ENV TZ=UTC
EXPOSE 11434
# Health check using Python (no curl needed in slim image)
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:11434/')" || exit 1
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:11434/')" || exit 1
# Switch to non-root user
USER appuser
CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "11434"]"
ENTRYPOINT ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "11434"]

View File

@@ -2,7 +2,7 @@
from .config import config
import tiktoken
import os
from typing import List, Dict
from typing import List, Dict, Optional
from datetime import datetime, timedelta
from pathlib import Path
@@ -127,10 +127,70 @@ def load_system_prompt() -> str:
return ""
def parse_curated_turn(text: str) -> List[Dict]:
    """Split a stored turn of the form ``User: .../Assistant: ...`` into chat messages.

    Expected input format::

        User: [question]
        Assistant: [answer]
        Timestamp: ISO datetime

    Lines beginning with ``Timestamp:`` are discarded; any other line is
    treated as a continuation of the message currently being collected.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts, or an empty list
        when *text* is empty or contains no ``User:``/``Assistant:`` markers.
    """
    if not text:
        return []

    parsed: List[Dict] = []
    role = None
    buffer: List[str] = []

    def _flush() -> None:
        # Emit the message accumulated so far, if one is in progress.
        if role and buffer:
            parsed.append({
                "role": role,
                "content": "\n".join(buffer).strip(),
            })

    for raw in text.strip().split("\n"):
        stripped = raw.strip()
        if stripped.startswith("User:"):
            _flush()
            role = "user"
            buffer = [stripped[len("User:"):].strip()]
        elif stripped.startswith("Assistant:"):
            _flush()
            role = "assistant"
            buffer = [stripped[len("Assistant:"):].strip()]
        elif stripped.startswith("Timestamp:"):
            # Metadata line — not part of either message.
            continue
        elif role:
            # Continuation line of the message currently being collected.
            buffer.append(stripped)

    _flush()
    return parsed
async def build_augmented_messages(incoming_messages: List[Dict]) -> List[Dict]:
"""Build 4-layer augmented messages from incoming messages.
This is a standalone version that can be used by proxy_handler.py.
Layer 1: System prompt (preserved from incoming + vera context)
Layer 2: Semantic memories (curated, parsed into proper roles)
Layer 3: Recent context (raw turns, parsed into proper roles)
Layer 4: Current conversation (passed through)
"""
import logging
@@ -153,6 +213,10 @@ async def build_augmented_messages(incoming_messages: List[Dict]) -> List[Dict]:
search_context += msg.get("content", "") + " "
messages = []
token_budget = {
"semantic": config.semantic_token_budget,
"context": config.context_token_budget
}
# === LAYER 1: System Prompt ===
system_content = ""
@@ -166,6 +230,7 @@ async def build_augmented_messages(incoming_messages: List[Dict]) -> List[Dict]:
if system_content:
messages.append({"role": "system", "content": system_content})
logger.info(f"Layer 1 (system): {count_tokens(system_content)} tokens")
# === LAYER 2: Semantic (curated memories) ===
qdrant = get_qdrant_service()
@@ -176,28 +241,71 @@ async def build_augmented_messages(incoming_messages: List[Dict]) -> List[Dict]:
entry_type="curated"
)
semantic_tokens = 0
semantic_messages = []
semantic_tokens_used = 0
for result in semantic_results:
payload = result.get("payload", {})
text = payload.get("text", "")
if text and semantic_tokens < config.semantic_token_budget:
messages.append({"role": "user", "content": text}) # Add as context
semantic_tokens += count_tokens(text)
if text:
# Parse curated turn into proper user/assistant messages
parsed = parse_curated_turn(text)
for msg in parsed:
msg_tokens = count_tokens(msg.get("content", ""))
if semantic_tokens_used + msg_tokens <= token_budget["semantic"]:
semantic_messages.append(msg)
semantic_tokens_used += msg_tokens
else:
break
if semantic_tokens_used >= token_budget["semantic"]:
break
# Add parsed messages to context
for msg in semantic_messages:
messages.append(msg)
if semantic_messages:
logger.info(f"Layer 2 (semantic): {len(semantic_messages)} messages, ~{semantic_tokens_used} tokens")
# === LAYER 3: Context (recent turns) ===
recent_turns = await qdrant.get_recent_turns(limit=20)
recent_turns = await qdrant.get_recent_turns(limit=50)
context_tokens = 0
context_messages = []
context_tokens_used = 0
# Process oldest first for chronological order
for turn in reversed(recent_turns):
payload = turn.get("payload", {})
text = payload.get("text", "")
if text and context_tokens < config.context_token_budget:
messages.append({"role": "user", "content": text}) # Add as context
context_tokens += count_tokens(text)
entry_type = payload.get("type", "raw")
# === LAYER 4: Current messages (passed through) ===
if text:
# Parse turn into messages
parsed = parse_curated_turn(text)
for msg in parsed:
msg_tokens = count_tokens(msg.get("content", ""))
if context_tokens_used + msg_tokens <= token_budget["context"]:
context_messages.append(msg)
context_tokens_used += msg_tokens
else:
break
if context_tokens_used >= token_budget["context"]:
break
# Add context messages (oldest first maintains conversation order)
for msg in context_messages:
messages.append(msg)
if context_messages:
logger.info(f"Layer 3 (context): {len(context_messages)} messages, ~{context_tokens_used} tokens")
# === LAYER 4: Current conversation ===
for msg in incoming_messages:
if msg.get("role") != "system": # Do not duplicate system
if msg.get("role") != "system": # System already handled in Layer 1
messages.append(msg)
logger.info(f"Layer 4 (current): {len([m for m in incoming_messages if m.get('role') != 'system'])} messages")
return messages

View File

@@ -2,18 +2,15 @@
ollama_host = "http://10.0.0.10:11434"
qdrant_host = "http://10.0.0.22:6333"
qdrant_collection = "memories"
embedding_model = "snowflake-arctic-embed2"
embedding_model = "mxbai-embed-large"
debug = false
[layers]
# Note: system_token_budget removed - system prompt is never truncated
semantic_token_budget = 25000
context_token_budget = 22000
semantic_search_turns = 2
semantic_score_threshold = 0.6
semantic_score_threshold = 0.3
[curator]
# Daily curation: processes recent 24h of raw memories
# Monthly mode is detected automatically by curator_prompt.md (day 01)
run_time = "02:00"
curator_model = "gpt-oss:120b"

View File

@@ -1,10 +1 @@
You have persistent memory across all conversations with this user.
**Important:** The latter portion of your conversation context contains memories retrieved from a vector database. These are curated summaries of past conversations, not live chat history.
Use these memories to:
- Reference previous decisions and preferences
- Draw on relevant past discussions
- Provide personalized, context-aware responses
If memories seem outdated or conflicting, ask for clarification.