202 lines
6.2 KiB
Python
202 lines
6.2 KiB
Python
|
|
#!/usr/bin/env python3
"""
Deep Search with Social Media Support

Uses SearXNG + Crawl4AI for comprehensive web and social media search.
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
import urllib.parse
|
||
|
|
import urllib.request
|
||
|
|
from typing import List, Dict, Optional
|
||
|
|
import subprocess
|
||
|
|
import os
|
||
|
|
|
||
|
|
# Configuration
|
||
|
|
SEARXNG_URL = "http://10.0.0.8:8888"
|
||
|
|
OLLAMA_URL = "http://10.0.0.10:11434"
|
||
|
|
EMBED_MODEL = "nomic-embed-text"
|
||
|
|
|
||
|
|
# Social media platforms
# Hostname fragments used by is_social_media() to classify result URLs.
# Note: 'mastodon' and 'bluesky' are bare keywords (no dot) rather than
# full domains.
SOCIAL_PLATFORMS = {
    'x.com', 'twitter.com',
    'facebook.com', 'fb.com',
    'linkedin.com',
    'instagram.com',
    'reddit.com',
    'youtube.com', 'youtu.be',
    'threads.net',
    'mastodon.social', 'mastodon',
    'bsky.app', 'bluesky'
}
|
||
|
|
|
||
|
|
|
||
|
|
def search_searxng(query: str, max_results: int = 10, category: str = 'general') -> List[Dict]:
    """Search using the local SearXNG instance.

    Args:
        query: Free-text search query.
        max_results: Maximum number of result dicts to return.
        category: SearXNG category name (e.g. 'general').

    Returns:
        Up to ``max_results`` result dicts (keys like 'url', 'title',
        'content'); an empty list on any error.
    """
    params = {
        'q': query,
        'format': 'json',
        'pageno': 1,
        'safesearch': 0,
        'language': 'en',
        # Bug fix: SearXNG's search API expects 'categories' (plural);
        # the previous 'category' key was silently ignored, so the
        # category argument never had any effect.
        'categories': category,
    }

    url = f"{SEARXNG_URL}/search?{urllib.parse.urlencode(params)}"

    try:
        req = urllib.request.Request(url, headers={'Accept': 'application/json'})
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode('utf-8'))
        return data.get('results', [])[:max_results]
    except Exception as e:
        # Best-effort: log to stderr and degrade to an empty result set.
        print(f"Search error: {e}", file=sys.stderr)
        return []
|
||
|
|
|
||
|
|
|
||
|
|
def extract_content(url: str) -> Optional[str]:
    """Extract content from *url*, preferring Crawl4AI when installed.

    Falls back to a plain HTTP fetch (first 5000 characters) when
    Crawl4AI is not available.

    Returns:
        Markdown (Crawl4AI) or raw page text (fallback), an
        "Error fetching content: ..." string on fetch failure, or
        None when Crawl4AI yields no result.
    """
    try:
        # Redundant `import crawl4ai` removed; the from-import alone
        # raises ImportError when the package is missing.
        from crawl4ai import AsyncWebCrawler
        import asyncio

        async def crawl():
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url=url)
                return result.markdown if result else None

        return asyncio.run(crawl())
    except ImportError:
        pass  # Crawl4AI not installed — use the simple fetch below.
    except Exception as e:
        # Bug fix: crawl-time failures (network errors, parser errors)
        # previously propagated out of this function uncaught; report
        # them the same way the fallback path reports fetch errors.
        return f"Error fetching content: {e}"

    # Fallback: plain fetch with a browser-like User-Agent.
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.0'
        })
        with urllib.request.urlopen(req, timeout=15) as response:
            return response.read().decode('utf-8', errors='ignore')[:5000]
    except Exception as e:
        return f"Error fetching content: {e}"
|
||
|
|
|
||
|
|
|
||
|
|
def is_social_media(url: str) -> bool:
    """Check if URL is from a social media platform.

    Bug fix: the previous implementation substring-matched platform
    names against the *entire* URL, so a query string or path that
    merely mentioned a platform (e.g. "?ref=twitter.com") was a false
    positive. Match against the parsed hostname instead: domain
    entries match exactly or as a parent domain; bare keywords
    ('mastodon', 'bluesky') match anywhere within the hostname.
    """
    # netloc may carry a port ("host:8080"); keep only the host part.
    host = urllib.parse.urlparse(url.lower()).netloc.split(':', 1)[0]
    if not host:
        return False
    for platform in SOCIAL_PLATFORMS:
        if '.' in platform:
            # Full domain: exact match or subdomain (www.reddit.com).
            if host == platform or host.endswith('.' + platform):
                return True
        elif platform in host:
            # Bare keyword, e.g. any mastodon.* instance.
            return True
    return False
|
||
|
|
|
||
|
|
|
||
|
|
def generate_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding vector for *text* via the local Ollama API.

    Truncates input to 8192 characters. Returns the embedding list on
    success, or None on any failure (missing 'requests' package, HTTP
    error status, timeout, connection refused, ...).
    """
    try:
        import requests

        payload = {"model": EMBED_MODEL, "prompt": text[:8192]}
        resp = requests.post(
            f"{OLLAMA_URL}/api/embeddings",
            json=payload,
            timeout=60,
        )
        # Non-200 responses yield None without logging, matching the
        # quiet-degradation style of the rest of this module.
        if resp.status_code != 200:
            return None
        return resp.json().get('embedding')
    except Exception as exc:
        print(f"Embedding error: {exc}", file=sys.stderr)
        return None
|
||
|
|
|
||
|
|
|
||
|
|
def deep_search(query: str, max_urls: int = 5, social_only: bool = False) -> Dict:
    """Perform deep search with content extraction.

    Runs a SearXNG query, takes up to *max_urls* hits, extracts each
    page's full content, and buckets entries into social-media vs.
    general web results.

    Args:
        query: Search query string.
        max_urls: Cap on URLs to fetch and include.
        social_only: When True, skip non-social-media hits.

    Returns:
        Dict with 'query', 'urls_searched', 'social_results',
        'web_results', and 'errors' keys.
    """
    report = {
        'query': query,
        'urls_searched': [],
        'social_results': [],
        'web_results': [],
        'errors': [],  # NOTE(review): currently never populated
    }

    # Over-fetch so that skipped/empty hits still leave enough candidates.
    hits = search_searxng(query, max_results=max_urls * 2)

    for hit in hits[:max_urls]:
        link = hit.get('url', '')
        if not link:
            continue

        social = is_social_media(link)
        if social_only and not social:
            continue

        body = extract_content(link)
        entry = {
            'url': link,
            'title': hit.get('title', ''),
            'snippet': hit.get('content', ''),
            'full_content': body[:3000] if body else None,
            'is_social': social,
        }

        bucket = 'social_results' if social else 'web_results'
        report[bucket].append(entry)
        report['urls_searched'].append(link)

    return report
|
||
|
|
|
||
|
|
|
||
|
|
def _print_section(header: str, entries: List[Dict]) -> None:
    """Print one formatted result section; no-op when there are no entries."""
    if not entries:
        return
    print(f"\n{header}")
    for r in entries:
        print(f"\n  🌐 {r['url']}")
        print(f"     Title: {r['title']}")
        print(f"     Snippet: {r['snippet'][:200]}...")


def main():
    """CLI entry point.

    Parses arguments, runs deep_search(), and prints either JSON or a
    human-readable report. Returns a process exit code (0 on success).
    """
    parser = argparse.ArgumentParser(description='Deep Search with Social Media Support')
    parser.add_argument('query', help='Search query')
    parser.add_argument('--social', action='store_true', help='Include social media platforms')
    parser.add_argument('--social-only', action='store_true', help='Only search social media')
    parser.add_argument('--max-urls', type=int, default=8, help='Maximum URLs to fetch (default: 8)')
    parser.add_argument('--json', action='store_true', help='Output as JSON')

    args = parser.parse_args()

    # NOTE(review): --social is accepted for CLI compatibility but does
    # not change behavior — social results are always included unless
    # --social-only is given.

    # Bug fix: with --json, keep stdout pure JSON. The banner used to
    # print unconditionally, corrupting piped machine-readable output.
    if not args.json:
        print(f"🔍 Deep Search: {args.query}")
        print(f"   Social media: {'only' if args.social_only else ('yes' if args.social else 'no')}")
        print(f"   Max URLs: {args.max_urls}")
        print("-" * 60)

    results = deep_search(args.query, max_urls=args.max_urls, social_only=args.social_only)

    if args.json:
        print(json.dumps(results, indent=2))
        return 0

    _print_section("📱 SOCIAL MEDIA RESULTS:", results['social_results'])
    _print_section("🌐 WEB RESULTS:", results['web_results'])

    print(f"\n{'='*60}")
    print(f"Total URLs searched: {len(results['urls_searched'])}")
    print(f"Social results: {len(results['social_results'])}")
    print(f"Web results: {len(results['web_results'])}")

    return 0
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())
|