#!/usr/bin/env python3
"""
Monitor OpenClaw GitHub repo for relevant updates
Only outputs/announces when there are significant changes affecting our setup.
Always exits with code 0 to prevent "exec failed" logs.
Usage: monitor_openclaw_repo.py [--json]
"""

import argparse
import sys
import json
import urllib.request
import re
import hashlib
from datetime import datetime

# Base URL of the Qdrant REST API that holds the knowledge base.
QDRANT_URL = "http://10.0.0.40:6333"
# Name of the Qdrant collection that every read, write and delete targets.
KB_COLLECTION = "knowledge_base"

# Keywords that indicate relevance to our setup. is_relevant_change() looks
# for them case-insensitively; each one found adds 1 to a change's score.
RELEVANT_KEYWORDS = [
    "ollama", "model", "embedding", "llm", "ai",
    "telegram", "webchat", "signal", "discord",
    "skill", "skills", "qdrant", "memory", "search",
    "whisper", "tts", "voice", "cron",
    "gateway", "agent", "session", "vector",
    "browser", "exec", "read", "edit", "write",
    "breaking", "deprecated", "removed", "changed",
    "fix", "bug", "patch", "security", "vulnerability"
]

# Areas we depend on most. Each one found adds a further 2 to the score, and
# evaluate_significance() treats a single hit as enough to announce a change.
HIGH_PRIORITY_AREAS = [
    "ollama", "telegram", "qdrant", "memory", "skills",
    "voice", "cron", "gateway", "browser"
]

def fetch_github_api(url):
    """Fetch *url* from the GitHub REST API and return the decoded JSON.

    Args:
        url: Absolute URL of the API endpoint to query.

    Returns:
        The parsed JSON payload (a list or a dict, depending on the
        endpoint), or ``None`` if the request could not be built, sent or
        decoded.
    """
    headers = {
        'User-Agent': 'OpenClaw-KB-Monitor',
        'Accept': 'application/vnd.github.v3+json'
    }
    try:
        # Request() itself raises ValueError on a malformed URL, so it has to
        # sit inside the try for the "None on failure" contract to hold.
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=20) as response:
            return json.loads(response.read().decode())
    except Exception:
        # Best-effort fetch: network errors, HTTP errors and bad JSON all
        # mean "no data this run" rather than a crash of the cron job.
        return None

def fetch_github_html(url):
    """Download an HTML page and reduce it to plain text.

    ``<script>`` and ``<style>`` elements are dropped together with their
    contents, the remaining tags are removed, and runs of whitespace are
    collapsed to single spaces.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        At most the first 5000 characters of the extracted text, or ``None``
        if the page could not be requested or read.
    """
    # A browser-like User-Agent is sent instead of the script's own name.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    try:
        # Request() raises ValueError on a malformed URL; keep it in the try.
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=20) as response:
            html = response.read().decode('utf-8', errors='ignore')
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit still work.
        return None

    # Scripts and styles go first so their bodies do not leak into the text.
    text = re.sub(r'<script[^>]*>.*?</script>', ' ', html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', ' ', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text[:5000]

def get_embedding(text):
    """Request an embedding vector for *text* from the local embed endpoint.

    Only the first 1000 characters of *text* are sent, using the
    ``nomic-embed-text`` model.

    Args:
        text: The text to embed.

    Returns:
        The first entry of the response's ``embeddings`` list, or ``None``
        if the request fails or the response carries no embedding.
    """
    data = {"model": "nomic-embed-text", "input": text[:1000]}
    try:
        req = urllib.request.Request(
            "http://localhost:11434/api/embed",
            data=json.dumps(data).encode(),
            headers={"Content-Type": "application/json"},
            method="POST"
        )
        with urllib.request.urlopen(req, timeout=30) as response:
            result = json.loads(response.read().decode())
    except Exception:
        # Best-effort: an unreachable or misbehaving service means "no vector".
        return None

    # A missing, null or empty list all mean there is nothing to return.
    embeddings = result.get("embeddings") if isinstance(result, dict) else None
    return embeddings[0] if embeddings else None

def search_kb_by_path(path_prefix):
    """Return knowledge-base points whose payload ``path`` starts with *path_prefix*.

    The collection is scrolled page by page and filtered client-side. Paging
    follows the ``next_page_offset`` value of each response (null on the last
    page, per the Qdrant scroll API), so matches are found even when the
    collection holds more points than fit in one page.

    Args:
        path_prefix: Prefix the payload ``path`` must start with.

    Returns:
        A list of point dicts as returned by Qdrant (including payload). If
        a request fails, the matches collected so far are returned, which is
        an empty list when the very first page fails.
    """
    url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/scroll"
    matches = []
    offset = None
    # Hard cap on pages so a misbehaving server cannot keep us looping.
    for _ in range(1000):
        data = {"limit": 100, "with_payload": True}
        if offset is not None:
            data["offset"] = offset
        try:
            req = urllib.request.Request(url, data=json.dumps(data).encode(),
                                         headers={"Content-Type": "application/json"}, method="POST")
            with urllib.request.urlopen(req, timeout=10) as response:
                result = json.loads(response.read().decode())
            page = result.get("result") or {}
            points = page.get("points") or []
            # "or {}" / "or ''" tolerate points stored with a null payload/path.
            page_matches = [
                p for p in points
                if str((p.get("payload") or {}).get("path") or "").startswith(path_prefix)
            ]
            offset = page.get("next_page_offset")
        except Exception:
            # Best-effort lookup: keep whatever earlier pages yielded.
            break
        matches.extend(page_matches)
        if offset is None:
            break
    return matches

def store_in_kb(text, metadata):
    """Embed *text* and upsert it as a new point in the knowledge base.

    Note that *metadata* is updated in place: ``checksum``, ``date_scraped``
    and ``text_preview`` are added before it is stored as the point payload.

    Args:
        text: Content to embed; also the input to the checksum and preview.
        metadata: Payload dict for the point (mutated, see above).

    Returns:
        ``True`` if Qdrant answered with status ``"ok"``, ``False`` if the
        upsert failed or reported another status, and ``None`` if no
        embedding could be obtained (nothing is sent in that case).
    """
    import uuid
    embedding = get_embedding(text)
    if not embedding:
        return None
    # Same truncated SHA-256 format that check_and_update() compares against.
    metadata["checksum"] = f"sha256:{hashlib.sha256(text.encode()).hexdigest()[:16]}"
    metadata["date_scraped"] = datetime.now().isoformat()
    metadata["text_preview"] = text[:300] + "..." if len(text) > 300 else text
    point = {"id": str(uuid.uuid4()), "vector": embedding, "payload": metadata}
    url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points"
    try:
        # Serialisation and request construction can fail too; keep them in
        # the try so every failure is reported as False.
        req = urllib.request.Request(url, data=json.dumps({"points": [point]}).encode(),
                                     headers={"Content-Type": "application/json"}, method="PUT")
        with urllib.request.urlopen(req, timeout=10) as response:
            result = json.loads(response.read().decode())
            return result.get("status") == "ok"
    except Exception:
        # Exception rather than a bare except, so Ctrl-C is not swallowed.
        return False

def delete_kb_entry(entry_id):
    """Delete a single point from the knowledge-base collection.

    Args:
        entry_id: Id of the point to delete. ``None`` is rejected without
            contacting the server.

    Returns:
        ``True`` if Qdrant answered with status ``"ok"``, otherwise ``False``
        (missing id, request failure, or any other status).
    """
    if entry_id is None:
        # Callers pass entry.get("id"), which is None for malformed entries.
        return False
    url = f"{QDRANT_URL}/collections/{KB_COLLECTION}/points/delete"
    data = {"points": [entry_id]}
    try:
        req = urllib.request.Request(url, data=json.dumps(data).encode(),
                                     headers={"Content-Type": "application/json"}, method="POST")
        with urllib.request.urlopen(req, timeout=10) as response:
            result = json.loads(response.read().decode())
            return result.get("status") == "ok"
    except Exception:
        # Best-effort delete; Exception keeps Ctrl-C and SystemExit working.
        return False

def is_relevant_change(text):
    """Score how relevant *text* is to our setup, based on keyword hits.

    Keywords are matched case-insensitively as whole words, optionally
    followed by a common inflection (``s``, ``es``, ``ed``, ``ing``). Plain
    substring matching is deliberately avoided: it made "ai" match "main",
    "fix" match "prefix" and "bug" match "debug", so almost any text scored
    as relevant.

    Args:
        text: Text to analyse (for example a release body plus its title).

    Returns:
        A dict with ``relevant`` (bool, any keyword found), ``keywords``
        (matching RELEVANT_KEYWORDS entries), ``high_priority`` (matching
        HIGH_PRIORITY_AREAS entries) and ``score`` (one point per keyword
        plus two per high-priority area).
    """
    text_lower = text.lower()

    def mentions(term):
        # Letters and digits delimit a word; "_" and "-" count as separators,
        # so identifiers such as "memory_search" still match "memory".
        pattern = rf"(?<![a-z0-9]){re.escape(term)}(?:s|es|ed|ing)?(?![a-z0-9])"
        return re.search(pattern, text_lower) is not None

    found_keywords = [kw for kw in RELEVANT_KEYWORDS if mentions(kw)]
    high_priority_found = [area for area in HIGH_PRIORITY_AREAS if mentions(area)]
    return {
        "relevant": len(found_keywords) > 0,
        "keywords": found_keywords,
        "high_priority": high_priority_found,
        "score": len(found_keywords) + (len(high_priority_found) * 2)
    }

def evaluate_significance(changes):
    """Decide whether a batch of detected changes is worth announcing.

    Args:
        changes: Iterable of change dicts, each carrying an ``analysis`` dict
            with a numeric ``score`` and a ``high_priority`` list.

    Returns:
        A dict with ``significant`` (bool), ``total_score`` (sum of all
        scores) and ``high_priority_count`` (total number of high-priority
        hits). A batch is significant when its total score reaches 3 or it
        contains at least one high-priority hit.
    """
    score_total = 0
    priority_hits = 0
    for change in changes:
        analysis = change["analysis"]
        score_total += analysis["score"]
        priority_hits += len(analysis["high_priority"])

    is_significant = priority_hits > 0 or score_total >= 3
    return {
        "significant": is_significant,
        "total_score": score_total,
        "high_priority_count": priority_hits,
    }

def format_summary(changes, significance):
    """Render the detected changes as a short, human-readable report.

    Changes are grouped by their ``section`` in first-seen order. Each group
    lists at most three titles (cut to 50 characters plus an ellipsis), names
    up to two affected high-priority areas per title, and ends with a count
    of any remaining items.

    Args:
        changes: List of change dicts with ``section``, ``title`` and
            ``analysis`` (containing ``high_priority``).
        significance: Result of evaluate_significance(); accepted for
            interface compatibility but not used in the output.

    Returns:
        The report as a single newline-joined string.
    """
    header = ["📊 OpenClaw Repo Update", f"📅 {datetime.now().strftime('%Y-%m-%d')}", ""]

    # Plain dicts preserve insertion order, so groups appear as first seen.
    grouped = {}
    for entry in changes:
        grouped.setdefault(entry["section"], []).append(entry)

    body = []
    for name, entries in grouped.items():
        body.append(f"📁 {name}")
        for entry in entries[:3]:
            heading = entry["title"]
            if len(heading) > 50:
                heading = heading[:50] + "..."
            body.append(f"   • {heading}")
            affected = entry["analysis"]["high_priority"]
            if affected:
                body.append(f"     ⚠️  Affects: {', '.join(affected[:2])}")
        overflow = len(entries) - 3
        if overflow > 0:
            body.append(f"   ... and {overflow} more")
        body.append("")
    return "\n".join(header + body)

def scrape_all_sections():
    """Collect the repo front page, recent releases and open issues.

    Returns:
        A list of section dicts. Every dict has ``section`` ("Main Repo",
        "Release" or "Issue"), ``title``, ``url`` and ``content``; releases
        also carry ``published`` and issues carry ``labels``. Sources that
        could not be fetched are simply left out, so the list may be empty.
    """
    sections = []
    main_text = fetch_github_html("https://github.com/openclaw/openclaw")
    if main_text:
        sections.append({"section": "Main Repo", "title": "openclaw/openclaw README",
                         "url": "https://github.com/openclaw/openclaw", "content": main_text})

    releases = fetch_github_api("https://api.github.com/repos/openclaw/openclaw/releases?per_page=5")
    # Only a JSON list is a valid listing; anything else is ignored.
    if isinstance(releases, list):
        for release in releases:
            # GitHub sends "name": null and "body": null for releases without
            # a title or notes, so dict.get() defaults never apply; "or" does.
            title = release.get("name") or release.get("tag_name") or "Unknown"
            sections.append({"section": "Release", "title": title,
                             "url": release.get("html_url", ""),
                             "content": (release.get("body") or "")[:2000],
                             "published": release.get("published_at", "")})

    issues = fetch_github_api("https://api.github.com/repos/openclaw/openclaw/issues?state=open&per_page=5")
    if isinstance(issues, list):
        for issue in issues:
            # The issues endpoint also returns pull requests; skip those.
            if "pull_request" in issue:
                continue
            body = issue.get("body")
            sections.append({"section": "Issue", "title": issue.get("title") or "Unknown",
                             "url": issue.get("html_url", ""),
                             "content": body[:1500] if body else "No description",
                             "labels": [label.get("name", "") for label in issue.get("labels") or []]})
    return sections

def check_and_update():
    """Scrape the repo, sync new content into the knowledge base, and report.

    A section counts as changed when the checksum of its content is not
    among the checksums already stored under ``OpenClaw/GitHub``. Changed
    sections are analysed for relevance and stored; a stored section then
    replaces one older entry with the same title.

    Returns:
        A ``(result, reason)`` tuple. ``result`` is a dict with ``changes``,
        ``significance`` and ``summary`` when significant changes were found
        (``reason`` is then ``None``); otherwise ``result`` is ``None`` and
        ``reason`` says why nothing is reported.
    """
    sections = scrape_all_sections()
    if not sections:
        return None, "No data scraped"

    existing_entries = search_kb_by_path("OpenClaw/GitHub")
    existing_checksums = {e.get("payload", {}).get("checksum", "") for e in existing_entries}
    changes_detected = []

    for section in sections:
        content = section["content"]
        if not content:
            continue
        checksum = f"sha256:{hashlib.sha256(content.encode()).hexdigest()[:16]}"
        if checksum in existing_checksums:
            continue

        section["analysis"] = is_relevant_change(content + " " + section["title"])
        section["checksum"] = checksum
        changes_detected.append(section)

        metadata = {
            "domain": "OpenClaw", "path": f"OpenClaw/GitHub/{section['section']}/{section['title'][:30]}",
            "subjects": ["openclaw", "github", section['section'].lower()], "category": "reference",
            "content_type": "web_page", "title": section["title"], "source_url": section["url"],
            "date_added": datetime.now().strftime("%Y-%m-%d")
        }
        # Store first and delete the stale entry only once the new one is
        # safely in place; deleting first would lose the old copy whenever
        # the embedding service or Qdrant is unavailable.
        if store_in_kb(content, metadata):
            stale = next(
                (e for e in existing_entries
                 if e.get("payload", {}).get("title", "") == section["title"]),
                None,
            )
            if stale is not None:
                delete_kb_entry(stale.get("id"))

    if not changes_detected:
        return None, "No changes detected"

    significance = evaluate_significance(changes_detected)
    if not significance["significant"]:
        return None, "Changes not significant"
    return {"changes": changes_detected, "significance": significance,
            "summary": format_summary(changes_detected, significance)}, None

def main():
    """Command-line entry point: run one check and print the outcome.

    With ``--json`` a JSON object is always printed (``{}`` when there is
    nothing to report). Without it, the text summary is printed only when
    significant changes were found. The process then exits with status 0,
    including when the check itself raised an unexpected error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--json", action="store_true")
    args = parser.parse_args()

    try:
        result, _reason = check_and_update()
    except Exception as exc:
        # An uncaught error would exit non-zero with a traceback, breaking the
        # "always exit 0" promise. Report on stderr and keep stdout clean.
        print(f"monitor_openclaw_repo: check failed: {exc}", file=sys.stderr)
        result = None

    # Always output JSON for cron compatibility, even if empty
    if args.json:
        print(json.dumps(result if result else {}))
    elif result:
        print(result["summary"])
    # If no result, output nothing (silent)

    # Always exit 0 to prevent "exec failed" logs
    sys.exit(0)

# Run the monitor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()