#!/usr/bin/env python3
"""
Deep Search with Social Media Support

Uses SearXNG + Crawl4AI for comprehensive web and social media search.
"""
import argparse
import json
import sys
import urllib.parse
import urllib.request
from typing import List, Dict, Optional
import subprocess
import os

# Configuration — local service endpoints (assumed reachable on the LAN).
SEARXNG_URL = "http://10.0.0.8:8888"
OLLAMA_URL = "http://10.0.0.10:11434"
EMBED_MODEL = "nomic-embed-text"

# Social media platforms. Dotted entries are matched as the URL's hostname
# (or any subdomain of it); bare keywords ('mastodon', 'bluesky') match
# anywhere inside the hostname so federated instances are caught too.
SOCIAL_PLATFORMS = {
    'x.com', 'twitter.com', 'facebook.com', 'fb.com', 'linkedin.com',
    'instagram.com', 'reddit.com', 'youtube.com', 'youtu.be',
    'threads.net', 'mastodon.social', 'mastodon', 'bsky.app', 'bluesky'
}


def search_searxng(query: str, max_results: int = 10,
                   category: str = 'general') -> List[Dict]:
    """Search using the local SearXNG instance.

    Args:
        query: Search query string.
        max_results: Maximum number of results to return.
        category: SearXNG category name (e.g. 'general', 'news').

    Returns:
        A list of result dicts (keys include 'url', 'title', 'content');
        an empty list on any error.
    """
    params = {
        'q': query,
        'format': 'json',
        'pageno': 1,
        'safesearch': 0,
        'language': 'en',
        # BUG FIX: the SearXNG search API expects 'categories' (plural);
        # the old 'category' key was silently ignored, so every search
        # ran against the default category.
        'categories': category,
    }
    url = f"{SEARXNG_URL}/search?{urllib.parse.urlencode(params)}"
    try:
        req = urllib.request.Request(url, headers={'Accept': 'application/json'})
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode('utf-8'))
        return data.get('results', [])[:max_results]
    except Exception as e:
        # Best-effort: report and degrade to "no results" rather than crash.
        print(f"Search error: {e}", file=sys.stderr)
        return []


def extract_content(url: str) -> Optional[str]:
    """Extract page content from *url*.

    Prefers Crawl4AI (returns markdown) when installed; otherwise falls
    back to a plain HTTP GET returning the first 5000 characters of the
    raw body.

    Returns:
        The extracted text, or None if extraction failed.
    """
    try:
        from crawl4ai import AsyncWebCrawler
        import asyncio

        async def crawl():
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url=url)
                return result.markdown if result else None

        return asyncio.run(crawl())
    except ImportError:
        # crawl4ai not installed — fall back to a simple fetch with a
        # browser-like User-Agent.
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.0'
            })
            with urllib.request.urlopen(req, timeout=15) as response:
                return response.read().decode('utf-8', errors='ignore')[:5000]
        except Exception as e:
            # BUG FIX: the old code returned the error message as if it
            # were page content, so callers could not tell failure from
            # success. Log and return None instead.
            print(f"Error fetching content from {url}: {e}", file=sys.stderr)
            return None
    except Exception as e:
        # BUG FIX: crawl4ai runtime failures previously propagated and
        # aborted the whole search; treat them as "no content".
        print(f"Crawl error for {url}: {e}", file=sys.stderr)
        return None


def is_social_media(url: str) -> bool:
    """Return True if *url* points at a known social media platform.

    Matching is done against the parsed hostname rather than the raw URL
    string, so e.g. 'https://linux.com/...' is no longer mistaken for
    'x.com' (the old substring check had such false positives).
    """
    host = (urllib.parse.urlparse(url).hostname or url).lower()
    for platform in SOCIAL_PLATFORMS:
        if '.' in platform:
            # Domain entry: exact host or any subdomain of it.
            if host == platform or host.endswith('.' + platform):
                return True
        elif platform in host:
            # Bare keyword (e.g. 'mastodon'): substring of the hostname,
            # so federated instance domains still match.
            return True
    return False


def generate_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding for *text* using the local Ollama instance.

    The prompt is truncated to 8192 characters before being sent.

    Returns:
        The embedding vector, or None on any failure.
    """
    try:
        # Use urllib (like the rest of this script) instead of the
        # third-party 'requests' package — one less optional dependency.
        payload = json.dumps(
            {"model": EMBED_MODEL, "prompt": text[:8192]}
        ).encode('utf-8')
        req = urllib.request.Request(
            f"{OLLAMA_URL}/api/embeddings",
            data=payload,
            headers={'Content-Type': 'application/json'},
        )
        # Non-200 responses raise HTTPError here and fall through to the
        # except clause, preserving the old "return None" contract.
        with urllib.request.urlopen(req, timeout=60) as response:
            return json.loads(response.read().decode('utf-8')).get('embedding')
    except Exception as e:
        print(f"Embedding error: {e}", file=sys.stderr)
        return None


def deep_search(query: str, max_urls: int = 5, social_only: bool = False) -> Dict:
    """Perform a deep search with full content extraction.

    Args:
        query: Search query.
        max_urls: Maximum number of URLs to fetch content from.
        social_only: If True, keep only social-media results.

    Returns:
        Dict with keys 'query', 'urls_searched', 'social_results',
        'web_results', 'errors'.
    """
    results = {
        'query': query,
        'urls_searched': [],
        'social_results': [],
        'web_results': [],
        'errors': [],
    }

    # Over-fetch so filtering (social_only, missing URLs) still has enough
    # candidates to fill max_urls slots.
    search_results = search_searxng(query, max_results=max_urls * 2)

    for result in search_results:
        # BUG FIX: stop once max_urls entries have been *kept*. The old
        # code sliced the candidate list before filtering, so social_only
        # searches returned fewer results than requested.
        if len(results['urls_searched']) >= max_urls:
            break

        url = result.get('url', '')
        if not url:
            continue

        is_social = is_social_media(url)
        if social_only and not is_social:
            continue

        # Extract full content (may be None on failure).
        full_content = extract_content(url)
        entry = {
            'url': url,
            'title': result.get('title', ''),
            'snippet': result.get('content', ''),
            'full_content': full_content[:3000] if full_content else None,
            'is_social': is_social,
        }

        bucket = 'social_results' if is_social else 'web_results'
        results[bucket].append(entry)
        results['urls_searched'].append(url)

    return results


def main():
    """CLI entry point. Returns a process exit code (0 on success)."""
    parser = argparse.ArgumentParser(description='Deep Search with Social Media Support')
    parser.add_argument('query', help='Search query')
    # NOTE(review): --social only affects the status line printed below;
    # deep_search always includes social results unless --social-only is set.
    parser.add_argument('--social', action='store_true',
                        help='Include social media platforms')
    parser.add_argument('--social-only', action='store_true',
                        help='Only search social media')
    parser.add_argument('--max-urls', type=int, default=8,
                        help='Maximum URLs to fetch (default: 8)')
    parser.add_argument('--json', action='store_true', help='Output as JSON')
    args = parser.parse_args()

    print(f"šŸ” Deep Search: {args.query}")
    print(f" Social media: {'only' if args.social_only else ('yes' if args.social else 'no')}")
    print(f" Max URLs: {args.max_urls}")
    print("-" * 60)

    results = deep_search(args.query, max_urls=args.max_urls,
                          social_only=args.social_only)

    if args.json:
        print(json.dumps(results, indent=2))
    else:
        # Print formatted results, social first.
        if results['social_results']:
            print("\nšŸ“± SOCIAL MEDIA RESULTS:")
            for r in results['social_results']:
                print(f"\n 🌐 {r['url']}")
                print(f" Title: {r['title']}")
                print(f" Snippet: {r['snippet'][:200]}...")
        if results['web_results']:
            print("\n🌐 WEB RESULTS:")
            for r in results['web_results']:
                print(f"\n 🌐 {r['url']}")
                print(f" Title: {r['title']}")
                print(f" Snippet: {r['snippet'][:200]}...")
        print(f"\n{'='*60}")
        print(f"Total URLs searched: {len(results['urls_searched'])}")
        print(f"Social results: {len(results['social_results'])}")
        print(f"Web results: {len(results['web_results'])}")

    return 0


if __name__ == '__main__':
    sys.exit(main())