202 lines
6.2 KiB
Python
202 lines
6.2 KiB
Python
|
|
#!/usr/bin/env python3
"""
Deep Search with Social Media Support

Uses SearXNG + Crawl4AI for comprehensive web and social media search.
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
import urllib.parse
|
||
|
|
import urllib.request
|
||
|
|
from typing import List, Dict, Optional
|
||
|
|
import subprocess
|
||
|
|
import os
|
||
|
|
|
||
|
|
# Configuration
|
||
|
|
SEARXNG_URL = "http://10.0.0.8:8888"
|
||
|
|
OLLAMA_URL = "http://10.0.0.10:11434"
|
||
|
|
EMBED_MODEL = "nomic-embed-text"
|
||
|
|
|
||
|
|
# Social media platforms
# Hostname fragments used by is_social_media() to classify result URLs.
# Note: 'mastodon' and 'bluesky' are bare keywords (no dot) rather than
# full domains.
SOCIAL_PLATFORMS = {
    'x.com', 'twitter.com',
    'facebook.com', 'fb.com',
    'linkedin.com',
    'instagram.com',
    'reddit.com',
    'youtube.com', 'youtu.be',
    'threads.net',
    'mastodon.social', 'mastodon',
    'bsky.app', 'bluesky'
}
|
||
|
|
|
||
|
|
|
||
|
|
def search_searxng(query: str, max_results: int = 10, category: str = 'general') -> List[Dict]:
    """Search using the local SearXNG instance.

    Args:
        query: Free-text search query.
        max_results: Maximum number of result dicts to return.
        category: SearXNG category name (e.g. 'general').

    Returns:
        Up to ``max_results`` result dicts (keys like 'url', 'title',
        'content'); an empty list on any error.
    """
    params = {
        'q': query,
        'format': 'json',
        'pageno': 1,
        'safesearch': 0,
        'language': 'en',
        # Bug fix: SearXNG's search API expects 'categories' (plural);
        # the previous 'category' key was silently ignored, so the
        # category argument never had any effect.
        'categories': category,
    }

    url = f"{SEARXNG_URL}/search?{urllib.parse.urlencode(params)}"

    try:
        req = urllib.request.Request(url, headers={'Accept': 'application/json'})
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode('utf-8'))
        return data.get('results', [])[:max_results]
    except Exception as e:
        # Best-effort: log to stderr and degrade to an empty result set.
        print(f"Search error: {e}", file=sys.stderr)
        return []
|
||
|
|
|
||
|
|
|
||
|
|
def extract_content(url: str) -> Optional[str]:
    """Extract content from *url*, preferring Crawl4AI when installed.

    Falls back to a plain HTTP fetch (first 5000 characters) when
    Crawl4AI is not available.

    Returns:
        Markdown (Crawl4AI) or raw page text (fallback), an
        "Error fetching content: ..." string on fetch failure, or
        None when Crawl4AI yields no result.
    """
    try:
        # Redundant `import crawl4ai` removed; the from-import alone
        # raises ImportError when the package is missing.
        from crawl4ai import AsyncWebCrawler
        import asyncio

        async def crawl():
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url=url)
                return result.markdown if result else None

        return asyncio.run(crawl())
    except ImportError:
        pass  # Crawl4AI not installed — use the simple fetch below.
    except Exception as e:
        # Bug fix: crawl-time failures (network errors, parser errors)
        # previously propagated out of this function uncaught; report
        # them the same way the fallback path reports fetch errors.
        return f"Error fetching content: {e}"

    # Fallback: plain fetch with a browser-like User-Agent.
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.0'
        })
        with urllib.request.urlopen(req, timeout=15) as response:
            return response.read().decode('utf-8', errors='ignore')[:5000]
    except Exception as e:
        return f"Error fetching content: {e}"
|
||
|
|
|
||
|
|
|
||
|
|
def is_social_media(url: str) -> bool:
    """Check if URL is from a social media platform.

    Bug fix: the previous implementation substring-matched platform
    names against the *entire* URL, so a query string or path that
    merely mentioned a platform (e.g. "?ref=twitter.com") was a false
    positive. Match against the parsed hostname instead: domain
    entries match exactly or as a parent domain; bare keywords
    ('mastodon', 'bluesky') match anywhere within the hostname.
    """
    # netloc may carry a port ("host:8080"); keep only the host part.
    host = urllib.parse.urlparse(url.lower()).netloc.split(':', 1)[0]
    if not host:
        return False
    for platform in SOCIAL_PLATFORMS:
        if '.' in platform:
            # Full domain: exact match or subdomain (www.reddit.com).
            if host == platform or host.endswith('.' + platform):
                return True
        elif platform in host:
            # Bare keyword, e.g. any mastodon.* instance.
            return True
    return False
|
||
|
|
|
||
|
|
|
||
|
|
def generate_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding vector for *text* via the local Ollama API.

    Truncates input to 8192 characters. Returns the embedding list on
    success, or None on any failure (missing 'requests' package, HTTP
    error status, timeout, connection refused, ...).
    """
    try:
        import requests

        payload = {"model": EMBED_MODEL, "prompt": text[:8192]}
        resp = requests.post(
            f"{OLLAMA_URL}/api/embeddings",
            json=payload,
            timeout=60,
        )
        # Non-200 responses yield None without logging, matching the
        # quiet-degradation style of the rest of this module.
        if resp.status_code != 200:
            return None
        return resp.json().get('embedding')
    except Exception as exc:
        print(f"Embedding error: {exc}", file=sys.stderr)
        return None
|
||
|
|
|
||
|
|
|
||
|
|
def deep_search(query: str, max_urls: int = 5, social_only: bool = False) -> Dict:
    """Perform deep search with content extraction.

    Runs a SearXNG query, takes up to *max_urls* hits, extracts each
    page's full content, and buckets entries into social-media vs.
    general web results.

    Args:
        query: Search query string.
        max_urls: Cap on URLs to fetch and include.
        social_only: When True, skip non-social-media hits.

    Returns:
        Dict with 'query', 'urls_searched', 'social_results',
        'web_results', and 'errors' keys.
    """
    report = {
        'query': query,
        'urls_searched': [],
        'social_results': [],
        'web_results': [],
        'errors': [],  # NOTE(review): currently never populated
    }

    # Over-fetch so that skipped/empty hits still leave enough candidates.
    hits = search_searxng(query, max_results=max_urls * 2)

    for hit in hits[:max_urls]:
        link = hit.get('url', '')
        if not link:
            continue

        social = is_social_media(link)
        if social_only and not social:
            continue

        body = extract_content(link)
        entry = {
            'url': link,
            'title': hit.get('title', ''),
            'snippet': hit.get('content', ''),
            'full_content': body[:3000] if body else None,
            'is_social': social,
        }

        bucket = 'social_results' if social else 'web_results'
        report[bucket].append(entry)
        report['urls_searched'].append(link)

    return report
|
||
|
|
|
||
|
|
|
||
|
|
def _print_section(header: str, entries: List[Dict]) -> None:
    """Print one formatted result section; no-op when there are no entries."""
    if not entries:
        return
    print(f"\n{header}")
    for r in entries:
        print(f"\n  🌐 {r['url']}")
        print(f"     Title: {r['title']}")
        print(f"     Snippet: {r['snippet'][:200]}...")


def main():
    """CLI entry point.

    Parses arguments, runs deep_search(), and prints either JSON or a
    human-readable report. Returns a process exit code (0 on success).
    """
    parser = argparse.ArgumentParser(description='Deep Search with Social Media Support')
    parser.add_argument('query', help='Search query')
    parser.add_argument('--social', action='store_true', help='Include social media platforms')
    parser.add_argument('--social-only', action='store_true', help='Only search social media')
    parser.add_argument('--max-urls', type=int, default=8, help='Maximum URLs to fetch (default: 8)')
    parser.add_argument('--json', action='store_true', help='Output as JSON')

    args = parser.parse_args()

    # NOTE(review): --social is accepted for CLI compatibility but does
    # not change behavior — social results are always included unless
    # --social-only is given.

    # Bug fix: with --json, keep stdout pure JSON. The banner used to
    # print unconditionally, corrupting piped machine-readable output.
    if not args.json:
        print(f"🔍 Deep Search: {args.query}")
        print(f"   Social media: {'only' if args.social_only else ('yes' if args.social else 'no')}")
        print(f"   Max URLs: {args.max_urls}")
        print("-" * 60)

    results = deep_search(args.query, max_urls=args.max_urls, social_only=args.social_only)

    if args.json:
        print(json.dumps(results, indent=2))
        return 0

    _print_section("📱 SOCIAL MEDIA RESULTS:", results['social_results'])
    _print_section("🌐 WEB RESULTS:", results['web_results'])

    print(f"\n{'='*60}")
    print(f"Total URLs searched: {len(results['urls_searched'])}")
    print(f"Social results: {len(results['social_results'])}")
    print(f"Web results: {len(results['web_results'])}")

    return 0
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())
|