Initial commit: workspace setup with skills, memory, config
This commit is contained in:
24
skills/deep-search/SKILL.md
Normal file
24
skills/deep-search/SKILL.md
Normal file
@@ -0,0 +1,24 @@
# deep-search Skill

Deep web search with social media support using SearXNG + Crawl4AI.

## Usage

```bash
python3 deep_search.py 'your search query'
python3 deep_search.py --social 'your search query'
python3 deep_search.py --social --max-urls 8 'query'
```

## Features

- Web search via local SearXNG (http://10.0.0.8:8888)
- Social media search: x.com, facebook, linkedin, instagram, reddit, youtube, threads, mastodon, bluesky
- Content extraction via Crawl4AI
- Local embedding with nomic-embed-text via Ollama

## Requirements

- SearXNG running at http://10.0.0.8:8888
- crawl4ai installed (`pip install crawl4ai`)
- Ollama with nomic-embed-text model
201
skills/deep-search/scripts/deep_search.py
Executable file
201
skills/deep-search/scripts/deep_search.py
Executable file
@@ -0,0 +1,201 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Deep Search with Social Media Support
|
||||
Uses SearXNG + Crawl4AI for comprehensive web and social media search.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import List, Dict, Optional
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
# --- Configuration -------------------------------------------------------
SEARXNG_URL = "http://10.0.0.8:8888"    # local SearXNG metasearch instance
OLLAMA_URL = "http://10.0.0.10:11434"   # local Ollama API endpoint
EMBED_MODEL = "nomic-embed-text"        # embedding model served by Ollama

# Hostname fragments used to classify a result URL as "social media".
# Matching is a loose substring test (see is_social_media), so bare words
# like 'mastodon' and 'bluesky' catch arbitrary instance domains.
SOCIAL_PLATFORMS = {
    'x.com', 'twitter.com',
    'facebook.com', 'fb.com',
    'linkedin.com',
    'instagram.com',
    'reddit.com',
    'youtube.com', 'youtu.be',
    'threads.net',
    'mastodon.social', 'mastodon',
    'bsky.app', 'bluesky',
}
||||
|
||||
|
||||
def search_searxng(query: str, max_results: int = 10, category: str = 'general') -> List[Dict]:
    """Search using the local SearXNG instance.

    Args:
        query: Free-text search query.
        max_results: Maximum number of result dicts to return.
        category: SearXNG category name (e.g. 'general', 'social media').

    Returns:
        Up to ``max_results`` raw result dicts from SearXNG's JSON API, or
        an empty list on any error (the error is logged to stderr).
    """
    params = {
        'q': query,
        'format': 'json',
        'pageno': 1,
        'safesearch': 0,
        'language': 'en',
        # BUG FIX: SearXNG's search API takes the plural 'categories'
        # parameter; the singular 'category' is silently ignored, so the
        # category argument previously had no effect.
        'categories': category,
    }

    url = f"{SEARXNG_URL}/search?{urllib.parse.urlencode(params)}"

    try:
        req = urllib.request.Request(url, headers={'Accept': 'application/json'})
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode('utf-8'))
        return data.get('results', [])[:max_results]
    except Exception as e:
        # Best-effort: a failed search degrades to "no results" rather than
        # aborting the whole deep search.
        print(f"Search error: {e}", file=sys.stderr)
        return []
|
||||
|
||||
def _fetch_simple(url: str) -> Optional[str]:
    """Fallback extraction: plain HTTP GET, truncated to 5000 characters.

    Returns an in-band ``"Error fetching content: ..."`` string on failure
    rather than raising, so one bad URL never aborts a whole search.
    """
    try:
        req = urllib.request.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.0'
        })
        with urllib.request.urlopen(req, timeout=15) as response:
            return response.read().decode('utf-8', errors='ignore')[:5000]
    except Exception as e:
        return f"Error fetching content: {e}"


def extract_content(url: str) -> Optional[str]:
    """Extract content from *url*, preferring Crawl4AI when installed.

    Returns Crawl4AI's markdown rendering of the page, or — when Crawl4AI
    is missing or the crawl itself fails — the raw body from a simple HTTP
    fetch. Returns None only when Crawl4AI runs but yields no result.
    """
    try:
        import asyncio
        from crawl4ai import AsyncWebCrawler

        async def _crawl():
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url=url)
                return result.markdown if result else None

        return asyncio.run(_crawl())
    except ImportError:
        # crawl4ai not installed — use the plain fetch.
        return _fetch_simple(url)
    except Exception as e:
        # BUG FIX: the original caught only ImportError, so any runtime
        # failure inside the crawl (network error, asyncio error) propagated
        # and crashed the whole deep search. Degrade to the simple fetch.
        print(f"crawl4ai failed for {url}: {e}", file=sys.stderr)
        return _fetch_simple(url)
||||
|
||||
def is_social_media(url: str) -> bool:
    """Return True when *url* matches a known social-media platform.

    Performs a case-insensitive substring test against every entry in
    SOCIAL_PLATFORMS (deliberately loose, so e.g. any Mastodon instance
    containing 'mastodon' in its hostname matches).
    """
    lowered = url.lower()
    return any(platform in lowered for platform in SOCIAL_PLATFORMS)
||||
|
||||
def generate_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding for *text* using the local Ollama API.

    The prompt is truncated to 8192 characters before embedding.

    Returns:
        The embedding vector, or None on any error (logged to stderr).
    """
    # CONSISTENCY FIX: the original used the third-party `requests` package,
    # which is neither imported at module level nor listed as a requirement;
    # the rest of this file uses stdlib urllib, so do the same here.
    payload = json.dumps({"model": EMBED_MODEL, "prompt": text[:8192]}).encode('utf-8')
    req = urllib.request.Request(
        f"{OLLAMA_URL}/api/embeddings",
        data=payload,
        headers={'Content-Type': 'application/json'},
        method='POST',
    )
    try:
        with urllib.request.urlopen(req, timeout=60) as response:
            if response.status == 200:
                return json.loads(response.read().decode('utf-8')).get('embedding')
            return None
    except Exception as e:
        print(f"Embedding error: {e}", file=sys.stderr)
        return None
||||
|
||||
def deep_search(query: str, max_urls: int = 5, social_only: bool = False) -> Dict:
    """Run a web search and enrich the top hits with extracted page content.

    Args:
        query: Free-text search query.
        max_urls: Maximum number of result URLs to visit.
        social_only: When True, keep only social-media results.

    Returns:
        A dict with keys 'query', 'urls_searched', 'social_results',
        'web_results' and 'errors'; each result entry carries url, title,
        snippet, full_content (truncated to 3000 chars) and is_social.
    """
    report: Dict = {
        'query': query,
        'urls_searched': [],
        'social_results': [],
        'web_results': [],
        'errors': [],
    }

    # Over-fetch candidates so filtering still leaves enough to visit.
    candidates = search_searxng(query, max_results=max_urls * 2)

    for hit in candidates[:max_urls]:
        link = hit.get('url', '')
        if not link:
            continue

        social = is_social_media(link)
        if social_only and not social:
            continue

        # Fetch the full page body for this hit.
        body = extract_content(link)

        entry = {
            'url': link,
            'title': hit.get('title', ''),
            'snippet': hit.get('content', ''),
            'full_content': body[:3000] if body else None,
            'is_social': social,
        }

        bucket = 'social_results' if social else 'web_results'
        report[bucket].append(entry)
        report['urls_searched'].append(link)

    return report
||||
|
||||
def _print_section(header: str, entries: List[Dict]) -> None:
    """Print one formatted result section; prints nothing when empty."""
    if not entries:
        return
    print(header)
    for r in entries:
        print(f"\n 🌐 {r['url']}")
        print(f" Title: {r['title']}")
        # Guard against a None snippet (SearXNG can return 'content': null).
        print(f" Snippet: {(r['snippet'] or '')[:200]}...")


def main():
    """CLI entry point: parse arguments, run the search, print results.

    Returns:
        Process exit code (always 0; individual fetch errors are reported
        inside the results rather than failing the run).
    """
    parser = argparse.ArgumentParser(description='Deep Search with Social Media Support')
    parser.add_argument('query', help='Search query')
    # NOTE(review): --social is only echoed in the banner below; it does not
    # change search behavior (social results are always included unless
    # --social-only is given) — confirm intent.
    parser.add_argument('--social', action='store_true', help='Include social media platforms')
    parser.add_argument('--social-only', action='store_true', help='Only search social media')
    parser.add_argument('--max-urls', type=int, default=8, help='Maximum URLs to fetch (default: 8)')
    parser.add_argument('--json', action='store_true', help='Output as JSON')

    args = parser.parse_args()

    print(f"🔍 Deep Search: {args.query}")
    print(f" Social media: {'only' if args.social_only else ('yes' if args.social else 'no')}")
    print(f" Max URLs: {args.max_urls}")
    print("-" * 60)

    results = deep_search(args.query, max_urls=args.max_urls, social_only=args.social_only)

    if args.json:
        print(json.dumps(results, indent=2))
    else:
        # DECOMPOSITION: the two result sections were verbatim-duplicated
        # printing loops; both now go through _print_section.
        _print_section("\n📱 SOCIAL MEDIA RESULTS:", results['social_results'])
        _print_section("\n🌐 WEB RESULTS:", results['web_results'])

        print(f"\n{'='*60}")
        print(f"Total URLs searched: {len(results['urls_searched'])}")
        print(f"Social results: {len(results['social_results'])}")
        print(f"Web results: {len(results['web_results'])}")

    return 0


if __name__ == '__main__':
    sys.exit(main())
|
||||
Reference in New Issue
Block a user