Initial commit: workspace setup with skills, memory, config

This commit is contained in:
root
2026-02-10 14:37:49 -06:00
commit d1357c5463
77 changed files with 10822 additions and 0 deletions

View File

@@ -0,0 +1,24 @@
# deep-search Skill
Deep web search with social media support using SearXNG + Crawl4AI.
## Usage
```bash
python3 deep_search.py 'your search query'
python3 deep_search.py --social 'your search query'
python3 deep_search.py --social --max-urls 8 'query'
```
## Features
- Web search via local SearXNG (http://10.0.0.8:8888)
- Social media search: X (Twitter), Facebook, LinkedIn, Instagram, Reddit, YouTube, Threads, Mastodon, Bluesky
- Content extraction via Crawl4AI
- Local embedding with nomic-embed-text via Ollama
## Requirements
- SearXNG running at http://10.0.0.8:8888
- crawl4ai installed (`pip install crawl4ai`)
- Ollama with nomic-embed-text model

View File

@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""
Deep Search with Social Media Support
Uses SearXNG + Crawl4AI for comprehensive web and social media search.
"""
import argparse
import json
import sys
import urllib.parse
import urllib.request
from typing import List, Dict, Optional
import subprocess
import os
# Configuration
SEARXNG_URL = "http://10.0.0.8:8888"    # local SearXNG metasearch instance
OLLAMA_URL = "http://10.0.0.10:11434"   # local Ollama server (embeddings API)
EMBED_MODEL = "nomic-embed-text"        # embedding model name served by Ollama

# Social media platforms: case-insensitive *substrings* matched against
# result URLs by is_social_media(). The bare words ('mastodon', 'bluesky')
# are intended to catch alternate / self-hosted instance domains, at the
# cost of occasional false positives on unrelated URLs containing them.
SOCIAL_PLATFORMS = {
    'x.com', 'twitter.com',
    'facebook.com', 'fb.com',
    'linkedin.com',
    'instagram.com',
    'reddit.com',
    'youtube.com', 'youtu.be',
    'threads.net',
    'mastodon.social', 'mastodon',
    'bsky.app', 'bluesky'
}
def search_searxng(query: str, max_results: int = 10, category: str = 'general') -> List[Dict]:
    """Search using the local SearXNG instance.

    Args:
        query: Free-text search query.
        max_results: Maximum number of result dicts to return.
        category: SearXNG category name (e.g. 'general').

    Returns:
        Up to ``max_results`` result dicts from SearXNG's JSON API, or an
        empty list on any network / parse error (best-effort behavior).
    """
    params = {
        'q': query,
        'format': 'json',
        'pageno': 1,
        'safesearch': 0,
        'language': 'en',
        # Fix: SearXNG's search API uses the plural 'categories' key;
        # the previous singular 'category' key was ignored by the server,
        # so the requested category never took effect.
        'categories': category
    }
    url = f"{SEARXNG_URL}/search?{urllib.parse.urlencode(params)}"
    try:
        req = urllib.request.Request(url, headers={'Accept': 'application/json'})
        with urllib.request.urlopen(req, timeout=30) as response:
            data = json.loads(response.read().decode('utf-8'))
        return data.get('results', [])[:max_results]
    except Exception as e:
        # Degrade to "no results" rather than crashing the whole search.
        print(f"Search error: {e}", file=sys.stderr)
        return []
def extract_content(url: str) -> Optional[str]:
    """Extract readable content from *url*.

    Prefers Crawl4AI (markdown extraction) when installed; otherwise falls
    back to a plain HTTP GET truncated to 5000 characters.

    Returns:
        Extracted text; an "Error fetching content: ..." string when the
        fallback fetch fails; or None when Crawl4AI runs but yields nothing
        or the crawl itself fails.
    """
    try:
        from crawl4ai import AsyncWebCrawler  # optional third-party dep
        import asyncio

        async def crawl():
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url=url)
                return result.markdown if result else None

        try:
            return asyncio.run(crawl())
        except Exception as e:
            # Bug fix: a crawl failure (network error, browser setup, etc.)
            # previously propagated out of this function and aborted the
            # caller's entire result loop; now it degrades to None.
            print(f"Crawl error for {url}: {e}", file=sys.stderr)
            return None
    except ImportError:
        # Fallback to a simple fetch when crawl4ai is not installed.
        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.0'
            })
            with urllib.request.urlopen(req, timeout=15) as response:
                return response.read().decode('utf-8', errors='ignore')[:5000]
        except Exception as e:
            # Deliberate best-effort: surface the error as the "content".
            return f"Error fetching content: {e}"
def is_social_media(url: str) -> bool:
    """Return True if *url* contains any known social-media domain.

    Matching is a case-insensitive substring test against every entry in
    SOCIAL_PLATFORMS, so bare keywords like 'mastodon' also match
    self-hosted instance URLs.
    """
    lowered = url.lower()
    return any(domain in lowered for domain in SOCIAL_PLATFORMS)
def generate_embedding(text: str) -> Optional[List[float]]:
    """Generate an embedding for *text* via the local Ollama API.

    Bug fix: the previous implementation imported the third-party
    `requests` package inside the function — it is absent from the file's
    imports and the skill's documented requirements, so the function
    silently returned None on machines without it. Rewritten with the
    stdlib `urllib.request` the rest of this file already uses.

    The prompt is truncated to 8192 characters (model context limit).

    Returns:
        The embedding vector, or None on any HTTP / network / parse error
        (logged to stderr) — same contract as before.
    """
    try:
        payload = json.dumps(
            {"model": EMBED_MODEL, "prompt": text[:8192]}
        ).encode('utf-8')
        req = urllib.request.Request(
            f"{OLLAMA_URL}/api/embeddings",
            data=payload,
            headers={'Content-Type': 'application/json'},
        )
        # Non-2xx responses raise HTTPError, caught below -> None,
        # matching the old status_code != 200 behavior.
        with urllib.request.urlopen(req, timeout=60) as response:
            return json.loads(response.read().decode('utf-8')).get('embedding')
    except Exception as e:
        print(f"Embedding error: {e}", file=sys.stderr)
        return None
def deep_search(query: str, max_urls: int = 5, social_only: bool = False) -> Dict:
    """Perform a deep search with full-content extraction.

    Args:
        query: Search query forwarded to SearXNG.
        max_urls: Maximum number of URLs to fetch and include.
        social_only: When True, keep only social-media results.

    Returns:
        Dict with 'query', 'urls_searched', 'social_results',
        'web_results', and 'errors' keys. Each result entry carries url,
        title, snippet, truncated full_content, and an is_social flag.
    """
    results = {
        'query': query,
        'urls_searched': [],
        'social_results': [],
        'web_results': [],
        'errors': []
    }
    # Fetch 2x candidates so social-only filtering still has enough to pick from.
    search_results = search_searxng(query, max_results=max_urls * 2)
    accepted = 0
    for result in search_results:
        if accepted >= max_urls:
            break
        url = result.get('url', '')
        title = result.get('title', '')
        snippet = result.get('content', '')
        if not url:
            continue
        is_social = is_social_media(url)
        # Bug fix: filter BEFORE counting toward max_urls. The original
        # sliced the candidate list to max_urls first, which discarded
        # exactly the extra results fetched for --social-only runs and
        # could return far fewer social hits than were available.
        if social_only and not is_social:
            continue
        # Extract full content (may be None or a best-effort error string).
        full_content = extract_content(url)
        entry = {
            'url': url,
            'title': title,
            'snippet': snippet,
            'full_content': full_content[:3000] if full_content else None,
            'is_social': is_social
        }
        if is_social:
            results['social_results'].append(entry)
        else:
            results['web_results'].append(entry)
        results['urls_searched'].append(url)
        accepted += 1
    return results
def main():
    """CLI entry point: parse args, run deep_search, print results.

    Returns 0 (used as the process exit code by the __main__ guard).
    """
    parser = argparse.ArgumentParser(description='Deep Search with Social Media Support')
    parser.add_argument('query', help='Search query')
    # NOTE(review): --social is parsed and echoed below but never passed to
    # deep_search(), so it currently has no effect on the search itself —
    # only --social-only changes behavior. Confirm intended semantics.
    parser.add_argument('--social', action='store_true', help='Include social media platforms')
    parser.add_argument('--social-only', action='store_true', help='Only search social media')
    parser.add_argument('--max-urls', type=int, default=8, help='Maximum URLs to fetch (default: 8)')
    parser.add_argument('--json', action='store_true', help='Output as JSON')
    args = parser.parse_args()
    # Header banner summarizing the effective options.
    print(f"🔍 Deep Search: {args.query}")
    print(f"   Social media: {'only' if args.social_only else ('yes' if args.social else 'no')}")
    print(f"   Max URLs: {args.max_urls}")
    print("-" * 60)
    results = deep_search(args.query, max_urls=args.max_urls, social_only=args.social_only)
    if args.json:
        # Machine-readable output: dump the full results dict.
        print(json.dumps(results, indent=2))
    else:
        # Print formatted results
        if results['social_results']:
            print("\n📱 SOCIAL MEDIA RESULTS:")
            for r in results['social_results']:
                print(f"\n  🌐 {r['url']}")
                print(f"     Title: {r['title']}")
                print(f"     Snippet: {r['snippet'][:200]}...")
        if results['web_results']:
            print("\n🌐 WEB RESULTS:")
            for r in results['web_results']:
                print(f"\n  🌐 {r['url']}")
                print(f"     Title: {r['title']}")
                print(f"     Snippet: {r['snippet'][:200]}...")
        # Summary footer with counts.
        print(f"\n{'='*60}")
        print(f"Total URLs searched: {len(results['urls_searched'])}")
        print(f"Social results: {len(results['social_results'])}")
        print(f"Web results: {len(results['web_results'])}")
    return 0
# Script entry point: propagate main()'s return value as the exit code.
if __name__ == '__main__':
    sys.exit(main())