feat: daily incremental updates - Xiaohongshu post images / sentiment log / daily report / draft archiving
skills/feed-to-md/scripts/feed_to_md.py (new file, 290 lines)
@@ -0,0 +1,290 @@
#!/usr/bin/env python3
"""Convert RSS/Atom feeds to Markdown with safe URL/path handling."""

from __future__ import annotations

import argparse
import html
import ipaddress
import pathlib
import re
import socket
import sys
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

TAG_RE = re.compile(r"<[^>]+>")


def normalize_text(value: str) -> str:
    text = html.unescape(value or "")
    text = TAG_RE.sub("", text)
    return " ".join(text.split()).strip()


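# SSRF guard: resolve the hostname up front and refuse anything that maps to a
# private, loopback, link-local, multicast, reserved, or unspecified address.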
def validate_public_hostname(hostname: str, label: str) -> None:
    if hostname in {"localhost", "localhost.localdomain"}:
        raise ValueError(f"{label} uses localhost, which is not allowed")

    try:
        addr_info = socket.getaddrinfo(hostname, None)
    except socket.gaierror as exc:
        raise ValueError(f"Unable to resolve host: {hostname}") from exc

    for item in addr_info:
        ip_raw = item[4][0]
        ip = ipaddress.ip_address(ip_raw)
        if (
            ip.is_private
            or ip.is_loopback
            or ip.is_link_local
            or ip.is_multicast
            or ip.is_reserved
            or ip.is_unspecified
        ):
            raise ValueError(f"{label} resolves to a non-public IP address")


def validate_feed_url(raw_url: str, label: str = "Feed URL") -> str:
    parsed = urllib.parse.urlparse(raw_url)
    if parsed.scheme not in {"http", "https"}:
        raise ValueError(f"{label} must use http or https")
    if not parsed.hostname:
        raise ValueError(f"{label} must include a hostname")

    hostname = parsed.hostname.strip().lower()
    validate_public_hostname(hostname, f"{label} host")

    return parsed.geturl()


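# Path guard: only relative ".md" paths without ".." are accepted, and the
# resolved target must stay inside the current working directory.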
def validate_output_path(raw_path: str) -> pathlib.Path:
    out_path = pathlib.Path(raw_path)
    if out_path.is_absolute():
        raise ValueError("Output path must be relative to the current workspace")
    if ".." in out_path.parts:
        raise ValueError("Output path must not contain '..'")
    if out_path.suffix.lower() != ".md":
        raise ValueError("Output path must end with .md")

    root = pathlib.Path.cwd().resolve()
    target = (root / out_path).resolve()
    try:
        target.relative_to(root)
    except ValueError as exc:
        raise ValueError("Output path escapes the current workspace") from exc
    return target


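# Re-validate every redirect target so a public feed URL cannot bounce the
# fetch to an internal address mid-flight.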
class PublicOnlyRedirectHandler(urllib.request.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):  # noqa: D401
        redirected_url = urllib.parse.urljoin(req.full_url, newurl)
        validate_feed_url(redirected_url, "Redirect URL")
        return super().redirect_request(req, fp, code, msg, headers, newurl)


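# Browser-like request headers are sent because some feed hosts reject the
# default urllib User-Agent; the final URL is checked again after redirects.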
def fetch_xml(url: str, timeout: int = 15) -> bytes:
    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, */*",
            "Accept-Language": "en-US,en;q=0.9",
        },
    )
    opener = urllib.request.build_opener(PublicOnlyRedirectHandler())
    with opener.open(request, timeout=timeout) as response:
        final_url = response.geturl()
        validate_feed_url(final_url, "Final URL")
        return response.read()


def namespace(tag: str) -> str | None:
    if tag.startswith("{") and "}" in tag:
        return tag[1:].split("}", 1)[0]
    return None


def find_text(elem: ET.Element, path: str, ns: dict[str, str] | None = None) -> str:
    child = elem.find(path, ns or {})
    if child is None or child.text is None:
        return ""
    return normalize_text(child.text)


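# RSS 2.0: prefer the full content:encoded body over the plain description
# when building each entry's summary.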
def parse_rss(root: ET.Element) -> tuple[str, list[dict[str, str]]]:
    content_ns = {"content": "http://purl.org/rss/1.0/modules/content/"}
    channel = root.find("channel")
    if channel is None:
        raise ValueError("Invalid RSS feed: missing channel")

    feed_title = find_text(channel, "title") or "Feed"
    entries: list[dict[str, str]] = []
    for item in channel.findall("item"):
        title = find_text(item, "title") or "Untitled"
        link = find_text(item, "link")
        summary = find_text(item, "content:encoded", content_ns) or find_text(
            item, "description"
        )
        published = find_text(item, "pubDate")
        entries.append(
            {
                "title": title,
                "link": link,
                "summary": summary,
                "published": published,
            }
        )
    return feed_title, entries


def parse_atom(root: ET.Element, atom_ns: str) -> tuple[str, list[dict[str, str]]]:
    ns = {"a": atom_ns}
    feed_title = find_text(root, "a:title", ns) or "Feed"
    entries: list[dict[str, str]] = []

    for entry in root.findall("a:entry", ns):
        title = find_text(entry, "a:title", ns) or "Untitled"
        summary = find_text(entry, "a:summary", ns) or find_text(entry, "a:content", ns)
        published = find_text(entry, "a:updated", ns) or find_text(entry, "a:published", ns)

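        # Prefer the rel="alternate" link; otherwise fall back to the first
        # link element that carries an href.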
link = ""
|
||||
for link_elem in entry.findall("a:link", ns):
|
||||
href = (link_elem.attrib.get("href") or "").strip()
|
||||
rel = (link_elem.attrib.get("rel") or "alternate").strip()
|
||||
if not href:
|
||||
continue
|
||||
if rel == "alternate":
|
||||
link = href
|
||||
break
|
||||
if not link:
|
||||
link = href
|
||||
|
||||
entries.append(
|
||||
{
|
||||
"title": title,
|
||||
"link": link,
|
||||
"summary": summary,
|
||||
"published": published,
|
||||
}
|
||||
)
|
||||
|
||||
return feed_title, entries
|
||||
|
||||
|
||||
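# Dispatch on the root element's namespace: Atom feeds use the 2005 Atom
# namespace URI, everything else is treated as RSS.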
def parse_feed(xml_bytes: bytes) -> tuple[str, list[dict[str, str]]]:
    root = ET.fromstring(xml_bytes)
    atom_ns = namespace(root.tag)
    if atom_ns == "http://www.w3.org/2005/Atom":
        return parse_atom(root, atom_ns)
    return parse_rss(root)


def truncate(value: str, max_len: int) -> str:
    if max_len <= 0 or len(value) <= max_len:
        return value
    clipped = value[: max_len - 1].rstrip()
    return f"{clipped}…"


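# "short" renders a bullet list of linked titles; "full" renders one H2 section
# per entry with the published date and an optional truncated summary.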
def render_markdown(
    feed_title: str,
    entries: list[dict[str, str]],
    template: str,
    include_summary: bool,
    summary_max_len: int,
) -> str:
    lines: list[str] = [f"# {feed_title}", ""]

    if not entries:
        lines.extend(["No feed items found.", ""])
        return "\n".join(lines).rstrip() + "\n"

    if template == "short":
        for item in entries:
            title = item["title"]
            link = item["link"]
            published = item["published"]
            line = f"- [{title}]({link})" if link else f"- {title}"
            if published:
                line += f" ({published})"
            lines.append(line)
        lines.append("")
        return "\n".join(lines)

    for item in entries:
        title = item["title"]
        link = item["link"]
        summary = truncate(item["summary"], summary_max_len)
        published = item["published"]

        lines.append(f"## [{title}]({link})" if link else f"## {title}")
        if published:
            lines.append(f"- Published: {published}")
        if include_summary and summary:
            lines.append("")
            lines.append(summary)
        lines.append("")

    return "\n".join(lines).rstrip() + "\n"


def build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Convert RSS/Atom feed URL to Markdown")
    parser.add_argument("url", help="RSS/Atom feed URL")
    parser.add_argument("-o", "--output", help="Write Markdown output to a .md file")
    parser.add_argument("--limit", type=int, default=0, help="Max number of feed items (0 = no limit)")
    parser.add_argument("--no-summary", action="store_true", help="Exclude summaries (full template only)")
    parser.add_argument(
        "--summary-max-length",
        type=int,
        default=280,
        help="Max summary length before truncation",
    )
    parser.add_argument(
        "--template",
        choices=("short", "full"),
        default="short",
        help="Output template style",
    )
    return parser


def main() -> int:
    args = build_arg_parser().parse_args()
    try:
        feed_url = validate_feed_url(args.url)
        output_path = validate_output_path(args.output) if args.output else None
        if args.limit < 0:
            raise ValueError("--limit must be >= 0")
        if args.summary_max_length < 0:
            raise ValueError("--summary-max-length must be >= 0")

        xml_bytes = fetch_xml(feed_url)
        feed_title, entries = parse_feed(xml_bytes)
        if args.limit:
            entries = entries[: args.limit]

        include_summary = (not args.no_summary) and args.template == "full"
        markdown = render_markdown(
            feed_title=feed_title,
            entries=entries,
            template=args.template,
            include_summary=include_summary,
            summary_max_len=args.summary_max_length,
        )

        if output_path:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            output_path.write_text(markdown, encoding="utf-8")
        else:
            sys.stdout.write(markdown)
        return 0
    except Exception as exc:  # noqa: BLE001
        sys.stderr.write(f"error: {exc}\n")
        return 1


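# Example invocations (illustrative only; the URL and output path below are
# placeholders, not endpoints this repository depends on):
#   python feed_to_md.py https://example.com/feed.xml
#   python feed_to_md.py https://example.com/feed.xml --template full --limit 10 -o notes/feed.md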
if __name__ == "__main__":
    raise SystemExit(main())