#!/usr/bin/env python3
"""
Deep Search with Social Media Support

Uses SearXNG + Crawl4AI for comprehensive web and social media search.
"""
import argparse
import json
import sys
import urllib.parse
import urllib.request
from typing import List, Dict, Optional
import subprocess
import os

# Configuration — local service endpoints (assumed reachable on the LAN).
SEARXNG_URL = "http://10.0.0.8:8888"
OLLAMA_URL = "http://10.0.0.10:11434"
EMBED_MODEL = "nomic-embed-text"

# Social media platforms. Dotted entries are matched as the URL's hostname
# (or any subdomain of it); bare keywords ('mastodon', 'bluesky') match
# anywhere inside the hostname so federated instances are caught too.
SOCIAL_PLATFORMS = {
    'x.com', 'twitter.com', 'facebook.com', 'fb.com', 'linkedin.com',
    'instagram.com', 'reddit.com', 'youtube.com', 'youtu.be',
    'threads.net', 'mastodon.social', 'mastodon', 'bsky.app', 'bluesky'
}


def search_searxng(query: str, max_results: int = 10,
                   category: str = 'general') -> List[Dict]:
    """Search using the local SearXNG instance.

    Args:
        query: Search query string.
        max_results: Maximum number of results to return.
        category: SearXNG category name (e.g. 'general', 'news').

    Returns:
        A list of result dicts (keys include 'url', 'title', 'content');
        an empty list on any error.
    """
    params = {
        'q': query,
        'format': 'json',
        'pageno': 1,
        'safesearch': 0,
        'language': 'en',
        # BUG FIX: the SearXNG search API expects 'categories' (plural);
        # the old 'category' key was silently ignored, so every search
        # ran against the default category.
        'categories': category,
    }
    url = f"{SEARXNG_URL}/search?{urllib.parse.urlencode(params)}"
    try:
        req = urllib.request.Request(url, headers={'Accept': 'application/json'})
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode('utf-8'))
        return data.get('results', [])[:max_results]
    except Exception as e:
        # Best-effort: report and degrade to "no results" rather than crash.
        print(f"Search error: {e}", file=sys.stderr)
        return []


def extract_content(url: str) -> Optional[str]:
    """Extract page content from *url*.

    Prefers Crawl4AI (returns markdown) when installed; otherwise falls
    back to a plain HTTP GET returning the first 5000 characters of the
    raw body.

    Returns:
        The extracted text, or None if extraction failed.
    """
    try:
        from crawl4ai import AsyncWebCrawler
        import asyncio

        async def crawl():
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url=url)
                return result.markdown if result else None

        return asyncio.run(crawl())
    except ImportError:
        # crawl4ai not installed — fall back to a simple fetch with a
        # browser-like User-Agent.
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.0'
            })
            with urllib.request.urlopen(req, timeout=15) as response:
                return response.read().decode('utf-8', errors='ignore')[:5000]
        except Exception as e:
            # BUG FIX: the old code returned the error message as if it
            # were page content, so callers could not tell failure from
            # success. Log and return None instead.
            print(f"Error fetching content from {url}: {e}", file=sys.stderr)
            return None
    except Exception as e:
        # BUG FIX: crawl4ai runtime failures previously propagated and
        # aborted the whole search; treat them as "no content".
        print(f"Crawl error for {url}: {e}", file=sys.stderr)
        return None


def is_social_media(url: str) -> bool:
    """Return True if *url* points at a known social media platform.

    Matching is done against the parsed hostname rather than the raw URL
    string, so e.g. 'https://linux.com/...' is no longer mistaken for
    'x.com' (the old substring check had such false positives).
    """
    host = (urllib.parse.urlparse(url).hostname or url).lower()
    for platform in SOCIAL_PLATFORMS:
        if '.' in platform:
            # Domain entry: exact host or any subdomain of it.
            if host == platform or host.endswith('.' + platform):
                return True
        elif platform in host:
            # Bare keyword (e.g. 'mastodon'): substring of the hostname,
            # so federated instance domains still match.
            return True
    return False


def generate_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding for *text* using the local Ollama instance.

    The prompt is truncated to 8192 characters before being sent.

    Returns:
        The embedding vector, or None on any failure.
    """
    try:
        # Use urllib (like the rest of this script) instead of the
        # third-party 'requests' package — one less optional dependency.
        payload = json.dumps(
            {"model": EMBED_MODEL, "prompt": text[:8192]}
        ).encode('utf-8')
        req = urllib.request.Request(
            f"{OLLAMA_URL}/api/embeddings",
            data=payload,
            headers={'Content-Type': 'application/json'},
        )
        # Non-200 responses raise HTTPError here and fall through to the
        # except clause, preserving the old "return None" contract.
        with urllib.request.urlopen(req, timeout=60) as response:
            return json.loads(response.read().decode('utf-8')).get('embedding')
    except Exception as e:
        print(f"Embedding error: {e}", file=sys.stderr)
        return None


def deep_search(query: str, max_urls: int = 5, social_only: bool = False) -> Dict:
    """Perform a deep search with full content extraction.

    Args:
        query: Search query.
        max_urls: Maximum number of URLs to fetch content from.
        social_only: If True, keep only social-media results.

    Returns:
        Dict with keys 'query', 'urls_searched', 'social_results',
        'web_results', 'errors'.
    """
    results = {
        'query': query,
        'urls_searched': [],
        'social_results': [],
        'web_results': [],
        'errors': [],
    }

    # Over-fetch so filtering (social_only, missing URLs) still has enough
    # candidates to fill max_urls slots.
    search_results = search_searxng(query, max_results=max_urls * 2)

    for result in search_results:
        # BUG FIX: stop once max_urls entries have been *kept*. The old
        # code sliced the candidate list before filtering, so social_only
        # searches returned fewer results than requested.
        if len(results['urls_searched']) >= max_urls:
            break

        url = result.get('url', '')
        if not url:
            continue

        is_social = is_social_media(url)
        if social_only and not is_social:
            continue

        # Extract full content (may be None on failure).
        full_content = extract_content(url)
        entry = {
            'url': url,
            'title': result.get('title', ''),
            'snippet': result.get('content', ''),
            'full_content': full_content[:3000] if full_content else None,
            'is_social': is_social,
        }

        bucket = 'social_results' if is_social else 'web_results'
        results[bucket].append(entry)
        results['urls_searched'].append(url)

    return results


def main():
    """CLI entry point. Returns a process exit code (0 on success)."""
    parser = argparse.ArgumentParser(description='Deep Search with Social Media Support')
    parser.add_argument('query', help='Search query')
    # NOTE(review): --social only affects the status line printed below;
    # deep_search always includes social results unless --social-only is set.
    parser.add_argument('--social', action='store_true',
                        help='Include social media platforms')
    parser.add_argument('--social-only', action='store_true',
                        help='Only search social media')
    parser.add_argument('--max-urls', type=int, default=8,
                        help='Maximum URLs to fetch (default: 8)')
    parser.add_argument('--json', action='store_true', help='Output as JSON')
    args = parser.parse_args()

    print(f"šŸ” Deep Search: {args.query}")
    print(f" Social media: {'only' if args.social_only else ('yes' if args.social else 'no')}")
    print(f" Max URLs: {args.max_urls}")
    print("-" * 60)

    results = deep_search(args.query, max_urls=args.max_urls,
                          social_only=args.social_only)

    if args.json:
        print(json.dumps(results, indent=2))
    else:
        # Print formatted results, social first.
        if results['social_results']:
            print("\nšŸ“± SOCIAL MEDIA RESULTS:")
            for r in results['social_results']:
                print(f"\n 🌐 {r['url']}")
                print(f" Title: {r['title']}")
                print(f" Snippet: {r['snippet'][:200]}...")
        if results['web_results']:
            print("\n🌐 WEB RESULTS:")
            for r in results['web_results']:
                print(f"\n 🌐 {r['url']}")
                print(f" Title: {r['title']}")
                print(f" Snippet: {r['snippet'][:200]}...")
        print(f"\n{'='*60}")
        print(f"Total URLs searched: {len(results['urls_searched'])}")
        print(f"Social results: {len(results['social_results'])}")
        print(f"Web results: {len(results['web_results'])}")

    return 0


if __name__ == '__main__':
    sys.exit(main())