#!/usr/bin/env python3
"""Crawl all 42 individual skill guide pages from sanwan.ai/skills/"""

import urllib.request
import time
import os
import re

# Optional dependency: html2text yields far cleaner Markdown than the regex
# fallback used later in this script.  Probe for it once at import time and
# record the result in HAS_H2T; `h` is only defined when the import succeeds.
try:
    import html2text
except ImportError:
    HAS_H2T = False
else:
    HAS_H2T = True
    h = html2text.HTML2Text()
    h.ignore_links = False   # keep hyperlinks in the Markdown output
    h.ignore_images = False  # keep image references too
    h.body_width = 0         # 0 = no hard line-wrapping

# Slug of every skill guide page to crawl (42 total); each one maps to a
# page at BASE_URL + "<slug>.html" and an output file "<slug>.md".
SKILLS = [
    "multi-agent-setup", "web-search", "deep-research", "weather", "email",
    "feishu-doc", "feishu-cal", "feishu-cards", "image-gen", "voice-gen",
    "coding", "github-ops", "browser-vision", "humanizer", "wechat",
    "twitter", "xiaohongshu", "web-scraper", "pdf-gen", "train-tickets",
    "flight-search", "stock-monitor", "seo-writer", "blog-writer",
    "intelligence", "video-summary", "trending", "hn-digest",
    "stock-analysis", "hk-stock", "project-manager", "fusheng-voice",
    "n8n-automation", "distil-web", "self-evolution", "security-audit",
    "summary-report", "competitor-research", "clawhub", "video-gen",
    "music-gen", "linkedin"
]

# Base URL of the skill guide pages.
BASE_URL = "https://sanwan.ai/skills/"
# Output directory: ./skills next to this script.  Created eagerly so the
# crawl loop can write files without per-file existence checks.
OUT_DIR = os.path.join(os.path.dirname(__file__), "skills")
os.makedirs(OUT_DIR, exist_ok=True)

success = 0
failed = 0

for skill in SKILLS:
    url = f"{BASE_URL}{skill}.html"
    out_path = os.path.join(OUT_DIR, f"{skill}.md")

    if os.path.exists(out_path):
        print(f"[SKIP] {skill} — already exists")
        success += 1
        continue

    try:
        req = urllib.request.Request(url, headers={
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
        })
        with urllib.request.urlopen(req, timeout=15) as resp:
            html = resp.read().decode("utf-8", errors="replace")

        if HAS_H2T:
            md = h.handle(html)
        else:
            # Basic fallback: strip tags
            md = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S)
            md = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.S)
            md = re.sub(r'<[^>]+>', '', html)
            md = re.sub(r'\n{3,}', '\n\n', md).strip()

        header = f"---\nsource: {url}\ncrawled: 2026-03-06\n---\n\n"

        with open(out_path, "w", encoding="utf-8") as f:
            f.write(header + md)

        print(f"[OK]   {skill} — {len(md)} chars")
        success += 1
    except Exception as e:
        print(f"[FAIL] {skill} — {e}")
        failed += 1

    time.sleep(0.3)

print(f"\nDone: {success} success, {failed} failed out of {len(SKILLS)} total")
