#!/usr/bin/env python3
from __future__ import annotations

import json
import time
from html.parser import HTMLParser
from pathlib import Path
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

try:
    import html2text
    USING_FALLBACK_HTML2TEXT = False
except ImportError:
    USING_FALLBACK_HTML2TEXT = True

    class _SimpleMarkdownParser(HTMLParser):
        """Minimal HTML -> Markdown converter used when html2text is absent.

        Supports paragraph breaks, markdown headings (``#`` .. ``######``),
        ``<br>`` line breaks and ``<li>`` bullets, and drops the contents of
        ``<script>``/``<style>``/``<noscript>``. All other markup is stripped
        and its text passed through unchanged.
        """

        # Tags whose text content must never reach the output.
        _SKIP_TAGS = {"script", "style", "noscript"}
        # Block-level tags that force a blank line before and after.
        _BLOCK_TAGS = {"p", "div", "section", "article"}
        # Heading tag -> markdown prefix ("h1" -> "# ", ..., "h6" -> "###### ").
        _HEADINGS = {f"h{level}": "#" * level + " " for level in range(1, 7)}

        def __init__(self) -> None:
            super().__init__()
            self._parts: list[str] = []
            # Nesting depth inside _SKIP_TAGS; > 0 means "discard data".
            self._skip_depth = 0

        def _write(self, text: str) -> None:
            if text:
                self._parts.append(text)

        def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
            if tag in self._SKIP_TAGS:
                self._skip_depth += 1
                return
            if self._skip_depth > 0:
                return
            if tag == "br":
                self._write("\n")
            elif tag in self._HEADINGS:
                # Emit a real markdown heading rather than a bare paragraph.
                self._write("\n\n" + self._HEADINGS[tag])
            elif tag in self._BLOCK_TAGS:
                self._write("\n\n")
            elif tag == "li":
                self._write("\n- ")

        def handle_endtag(self, tag: str) -> None:
            if tag in self._SKIP_TAGS and self._skip_depth > 0:
                self._skip_depth -= 1
                return
            if self._skip_depth > 0:
                return
            # Close headings with a blank line too, so following text does
            # not run onto the "# ..." line.
            if tag in self._BLOCK_TAGS or tag in self._HEADINGS:
                self._write("\n\n")

        def handle_data(self, data: str) -> None:
            # Text is kept only outside skipped (script/style) regions.
            if self._skip_depth == 0:
                self._write(data)

        def as_markdown(self) -> str:
            """Join collected fragments, collapsing runs of blank lines."""
            text = "".join(self._parts)
            lines = [line.rstrip() for line in text.splitlines()]
            compact: list[str] = []
            prev_blank = False
            for line in lines:
                is_blank = line.strip() == ""
                if is_blank and prev_blank:
                    continue
                compact.append(line)
                prev_blank = is_blank
            return "\n".join(compact).strip()

    class _FallbackHTML2Text:
        """Drop-in stand-in mirroring the subset of html2text.HTML2Text
        that this script configures (attributes are accepted but only
        ``handle`` has an effect)."""

        def __init__(self) -> None:
            self.body_width = 0
            self.ignore_images = False
            self.ignore_links = False
            self.protect_links = True

        def handle(self, html: str) -> str:
            parser = _SimpleMarkdownParser()
            parser.feed(html)
            parser.close()
            return parser.as_markdown()

    class _FallbackHtml2TextModule:
        # Mimics the ``html2text`` module surface used below.
        HTML2Text = _FallbackHTML2Text

    html2text = _FallbackHtml2TextModule()  # type: ignore[assignment]


# Directory containing this script; input and output files are resolved here.
BASE_DIR = Path(__file__).resolve().parent
# Input file: a JSON list of article records, or {"articles": [...]}.
ARTICLES_PATH = BASE_DIR / "articles.json"
# Minimum delay between consecutive HTTP requests (politeness rate limit).
REQUEST_INTERVAL_SECONDS = 0.5
# Browser-like User-Agent with an identifying crawler token appended.
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36 "
    "OpenClawArticleCrawler/1.0"
)


def yaml_quote(value: Any) -> str:
    """Render *value* as a double-quoted YAML scalar.

    JSON string escaping is a safe subset of YAML double-quoted style,
    so json.dumps does the quoting. ``None`` becomes the empty string;
    any other value is stringified first.
    """
    text = "" if value is None else str(value)
    return json.dumps(text, ensure_ascii=False)


def load_articles(path: Path) -> list[dict[str, Any]]:
    """Load article records from *path*.

    Accepts either a bare JSON list or an object with an ``articles``
    list; raises ValueError for any other shape.
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        articles = data.get("articles")
        if isinstance(articles, list):
            return articles
    raise ValueError(f"Unsupported JSON format in {path}")


def is_video_article(article: dict[str, Any]) -> bool:
    """Return True when the article points at a video platform.

    Checks the URL for Bilibili/YouTube hosts, then the source name for
    "b站" / "youtube" (case-insensitive).
    """
    source = (article.get("source") or "").lower()
    url = (article.get("sourceUrl") or "").lower()
    video_hosts = ("bilibili.com", "youtube.com", "youtu.be")
    if any(host in url for host in video_hosts):
        return True
    return "b站" in source or "youtube" in source


def combine_summary_content(article: dict[str, Any]) -> str:
    """Merge an article's summary and content into one text.

    When the content already begins with the summary the content alone is
    returned (avoids duplication); otherwise they are joined with a blank
    line. Missing fields degrade to whichever part exists, or "".
    """
    summary = (article.get("summary") or "").strip()
    content = (article.get("content") or "").strip()
    if not summary:
        return content
    if not content:
        return summary
    if content.startswith(summary):
        return content
    return f"{summary}\n\n{content}"


def decode_response(raw: bytes, content_type: str) -> str:
    """Decode an HTTP body, honoring the declared charset when present.

    Tries the charset from *content_type* first, then common encodings.
    latin-1 accepts any byte sequence, so in practice it is a catch-all;
    the final errors="replace" decode is a belt-and-braces fallback.
    """
    declared = ""
    for token in content_type.split(";"):
        token = token.strip().lower()
        if token.startswith("charset="):
            declared = token.split("=", 1)[1].strip()
            break
    candidates = [declared, "utf-8", "gb18030", "big5", "latin-1"]
    for encoding in filter(None, candidates):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            pass
    return raw.decode("utf-8", errors="replace")


def fetch_markdown(url: str, converter: html2text.HTML2Text) -> str:
    """Download *url* and return its HTML body converted to markdown.

    Sends a browser-like User-Agent, decodes the response with
    decode_response, and strips surrounding whitespace from the result.
    Network/HTTP errors propagate to the caller.
    """
    headers = {
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    with urlopen(Request(url, headers=headers), timeout=30) as response:
        payload = response.read()
        content_type = response.headers.get("Content-Type", "")
    html = decode_response(payload, content_type)
    return converter.handle(html).strip()


def build_frontmatter(article: dict[str, Any], url: str) -> str:
    """Build the YAML frontmatter block ("---" ... "---") for an article.

    Scalar fields are quoted via yaml_quote; tags become a YAML sequence
    (or the literal "tags: []" when absent/empty).
    """
    fields = [
        ("title", article.get("title", "")),
        ("source", article.get("source", "")),
        ("url", url),
        ("date", article.get("date", "")),
        ("category", article.get("category", "")),
    ]
    lines = ["---"]
    lines.extend(f"{key}: {yaml_quote(value)}" for key, value in fields)
    tags = article.get("tags") or []
    if tags:
        lines.append("tags:")
        lines.extend(f"  - {yaml_quote(tag)}" for tag in tags)
    else:
        lines.append("tags: []")
    lines.append("---")
    return "\n".join(lines)


def _safe_filename_stem(article_id: str) -> str:
    """Restrict an article id to filesystem-safe characters.

    Ids come straight from untrusted JSON; without this, an id containing
    "/" (or other path characters) could write outside BASE_DIR or make
    the path invalid. Alphanumerics (including CJK), ".", "_" and "-" are
    kept; everything else becomes "_". Normal ids are unchanged.
    """
    cleaned = "".join(c if c.isalnum() or c in "._-" else "_" for c in article_id)
    return cleaned or "article"


def main() -> None:
    """Write one markdown file per article and print crawl statistics.

    Body selection per article:
      * no sourceUrl        -> stored content as-is
      * video platform URL  -> stored summary + content (no fetch)
      * otherwise           -> fetch the page (rate limited) and convert to
                               markdown; on any failure fall back to the
                               stored summary + content
    """
    articles = load_articles(ARTICLES_PATH)

    if USING_FALLBACK_HTML2TEXT:
        print(
            "[WARN] html2text not installed; using built-in fallback converter. "
            "Run: python3 -m pip install html2text"
        )

    converter = html2text.HTML2Text()
    converter.body_width = 0        # no hard line wrapping
    converter.ignore_images = False
    converter.ignore_links = False
    converter.protect_links = True  # keep long URLs on one line

    stats = {
        "total": len(articles),
        "written": 0,
        "has_source_url": 0,
        "no_source_url": 0,
        "video_articles": 0,
        "fetch_attempted": 0,
        "fetch_success": 0,
        "fetch_failed": 0,
    }
    failures: list[tuple[str, str, str]] = []
    last_request_time = 0.0  # epoch seconds of the previous HTTP request

    for index, article in enumerate(articles, start=1):
        article_id = str(article.get("id") or f"article-{index:03d}")
        source_url = (article.get("sourceUrl") or "").strip()

        if source_url:
            stats["has_source_url"] += 1
        else:
            stats["no_source_url"] += 1

        if not source_url:
            body = (article.get("content") or "").strip()
        elif is_video_article(article):
            # Video pages are not scraped; the stored text is more useful.
            stats["video_articles"] += 1
            body = combine_summary_content(article)
        else:
            stats["fetch_attempted"] += 1
            # Politeness rate limit between consecutive requests.
            elapsed = time.time() - last_request_time
            if elapsed < REQUEST_INTERVAL_SECONDS:
                time.sleep(REQUEST_INTERVAL_SECONDS - elapsed)
            last_request_time = time.time()
            try:
                body = fetch_markdown(source_url, converter)
                if not body.strip():
                    # Treat an empty conversion as a failed fetch.
                    raise ValueError("Converted markdown is empty")
                stats["fetch_success"] += 1
            except (HTTPError, URLError, TimeoutError, ValueError, OSError) as exc:
                stats["fetch_failed"] += 1
                failures.append((article_id, source_url, str(exc)))
                fallback = combine_summary_content(article)
                body = f"[爬取失败]\n\n{fallback}".strip()

        # Last-resort fallback so every article produces a non-empty file.
        if not body.strip():
            body = combine_summary_content(article) or "[内容为空]"

        frontmatter = build_frontmatter(article, source_url)
        # Sanitize the id so untrusted JSON cannot escape BASE_DIR.
        output_path = BASE_DIR / f"{_safe_filename_stem(article_id)}.md"
        output_path.write_text(f"{frontmatter}\n\n{body.strip()}\n", encoding="utf-8")
        stats["written"] += 1

    print("=== Crawl Stats ===")
    print(f"Total articles: {stats['total']}")
    print(f"Written markdown files: {stats['written']}")
    print(f"Articles with sourceUrl: {stats['has_source_url']}")
    print(f"Articles without sourceUrl: {stats['no_source_url']}")
    print(f"Video articles (summary+content): {stats['video_articles']}")
    print(f"Fetch attempted: {stats['fetch_attempted']}")
    print(f"Fetch success: {stats['fetch_success']}")
    print(f"Fetch failed (fallback used): {stats['fetch_failed']}")

    if failures:
        print("\nFailed article list:")
        for article_id, url, reason in failures:
            print(f"- {article_id} | {url} | {reason}")


# Run the crawl only when executed as a script (import stays side-effect free).
if __name__ == "__main__":
    main()
