✨ 8 major features: trafilatura, digest, ntfy actions, templates, FTS5 search, backup/restore, proxy, RSS reader

- Full article extraction via trafilatura (fetch_full_article) - Digest mode with configurable period (digest_enabled, digest_period_hours) - ntfy Actions buttons (Open article, Open feed) - Notification templates with {title}, {body}, {link}, {source}, {image_url} - FTS5 full-text search in notification history - Database backup/restore (download/upload .db) - HTTP/SOCKS proxy for RSS feed fetching (proxy_url setting) - Built-in RSS reader tab with categories, unread counts, article detail view - Auto-category 'Общее' for feeds without a category - Article storage (Article table) for reader - DigestEntry model for pending digest entries Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 20:47:46 +08:00
parent f8d2c31658
commit 834092a3ec
13 changed files with 1414 additions and 44 deletions
@@ -2,30 +2,38 @@
 from __future__ import annotations

 import asyncio
+import json
 import logging
 import re
 from datetime import datetime, timezone
 from html import unescape

+import httpx
+from io import StringIO
+
 import feedparser
 from sqlmodel import Session, select

 from . import delivery
 from .database import engine, get_settings
 from .delivery import Message
-from .models import Feed, Notification, SeenEntry
+from .models import Article, DigestEntry, Feed, Notification, SeenEntry

 log = logging.getLogger("checker")

 _TAG_RE = re.compile(r"<[^>]+>")
 _IMG_RE = re.compile(r'<img[^>]+src=["\']([^"\']+)["\']', re.IGNORECASE)
+_VIDEO_RE = re.compile(
+    r'<(?:video|iframe|source|embed)[^>]*src=["\']([^"\']+)["\']', re.IGNORECASE
+)
+_ENC_VIDEO_RE = re.compile(r'<(?:video|iframe|source|embed)[^>]*>', re.IGNORECASE)


 def _strip_html(text: str, limit: int = 1500) -> str:
    text = unescape(_TAG_RE.sub(" ", text or ""))
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text).strip()
-    if len(text) > limit:
+    if limit and len(text) > limit:
        text = text[:limit].rsplit(" ", 1)[0] + " …"
    return text

@@ -57,6 +65,60 @@ def _extract_image(entry) -> str:
    return match.group(1) if match else ""


+def _extract_all_images(entry) -> list[str]:
+    """Extract ALL image URLs from a feed entry (deduplicated, order preserved)."""
+    urls: list[str] = []
+
+    # media_content / media_thumbnail
+    for key in ("media_content", "media_thumbnail"):
+        media = entry.get(key)
+        if media and isinstance(media, list):
+            for item in media:
+                url = item.get("url")
+                if url:
+                    urls.append(url)
+
+    # enclosure links with image/ type
+    for link in entry.get("links", []):
+        if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("image"):
+            href = link.get("href", "")
+            if href:
+                urls.append(href)
+
+    # <img> tags in HTML body
+    html = entry.get("summary") or entry.get("description") or ""
+    if not html:
+        content = entry.get("content")
+        if content and isinstance(content, list):
+            html = content[0].get("value", "")
+    urls.extend(_IMG_RE.findall(html or ""))
+
+    # deduplicate preserving order
+    return list(dict.fromkeys(urls))
+
+
+def _extract_videos(entry) -> list[str]:
+    """Extract video/multimedia URLs from a feed entry."""
+    urls: list[str] = []
+
+    # enclosure links with video/ type
+    for link in entry.get("links", []):
+        if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("video"):
+            href = link.get("href", "")
+            if href:
+                urls.append(href)
+
+    # <video>, <iframe>, <source>, <embed> tags in HTML body
+    html = entry.get("summary") or entry.get("description") or ""
+    if not html:
+        content = entry.get("content")
+        if content and isinstance(content, list):
+            html = content[0].get("value", "")
+    urls.extend(_VIDEO_RE.findall(html or ""))
+
+    return list(dict.fromkeys(urls))
+
+
 def _passes_filters(feed: Feed, title: str, body: str) -> bool:
    """Keyword include/exclude check (case-insensitive)."""
    haystack = f"{title}\n{body}".lower()
@@ -69,9 +131,43 @@ def _passes_filters(feed: Feed, title: str, body: str) -> bool:
    return True


-def _parse(url: str):
-    """Blocking feedparser call (run in a thread)."""
-    return feedparser.parse(url, agent="rss-ntfy/1.0 (+https://github.com)")
+def _parse_raw(xml: str):
+    """Blocking feedparser call on XML string (run in a thread)."""
+    return feedparser.parse(StringIO(xml), agent="rss-ntfy/1.0 (+https://github.com)")
+
+
+async def _fetch_feed(url: str, proxy: str = "") -> str:
+    """Download feed XML via httpx (supports proxy)."""
+    kw = {"timeout": 30}
+    if proxy.strip():
+        kw["proxy"] = proxy.strip()
+    async with httpx.AsyncClient(**kw) as client:
+        resp = await client.get(
+            url,
+            headers={"User-Agent": "rss-ntfy/1.0 (+https://github.com)"},
+        )
+        resp.raise_for_status()
+        return resp.text
+
+
+def _extract_full_article(url: str) -> tuple[str, str]:
+    """Fetch page and extract main article text via trafilatura.
+    Returns (plain_text, html) or ("", "") on failure.
+    """
+    try:
+        import trafilatura
+        downloaded = trafilatura.fetch_url(url)
+        if downloaded is None:
+            return "", ""
+        plain = trafilatura.extract(
+            downloaded, output_format="txt", with_metadata=False
+        ) or ""
+        html = trafilatura.extract(
+            downloaded, output_format="xml", with_metadata=False
+        ) or ""
+        return plain.strip(), html.strip()
+    except Exception:
+        return "", ""


 async def fetch_preview(url: str, include: str = "", exclude: str = "") -> dict:
@@ -79,7 +175,10 @@ async def fetch_preview(url: str, include: str = "", exclude: str = "") -> dict:

    Raises ValueError if the feed can't be parsed or has no matching entries.
    """
-    parsed = await asyncio.to_thread(_parse, url)
+    with Session(engine) as s:
+        proxy = get_settings(s).proxy_url
+    raw_xml = await _fetch_feed(url, proxy=proxy)
+    parsed = await asyncio.to_thread(_parse_raw, raw_xml)
    if getattr(parsed, "bozo", False) and not parsed.entries:
        raise ValueError(str(getattr(parsed, "bozo_exception", "parse error")))
    if not parsed.entries:
@@ -104,7 +203,13 @@ async def fetch_preview(url: str, include: str = "", exclude: str = "") -> dict:

 async def check_feed(feed: Feed) -> str:
    """Check a single feed, dispatch new entries, log history. Returns status."""
-    parsed = await asyncio.to_thread(_parse, feed.url)
+    # Load settings early for proxy URL
+    with Session(engine) as _sess:
+        _settings = get_settings(_sess)
+        proxy_url = _settings.proxy_url
+
+    raw_xml = await _fetch_feed(feed.url, proxy=proxy_url)
+    parsed = await asyncio.to_thread(_parse_raw, raw_xml)

    if getattr(parsed, "bozo", False) and not parsed.entries:
        exc = getattr(parsed, "bozo_exception", "parse error")
@@ -145,7 +250,24 @@ async def check_feed(feed: Feed) -> str:
                continue

            title = entry.get("title", "(без заголовка)")
-            body = _strip_html(entry.get("summary") or entry.get("description") or "")
+            raw_html = entry.get("summary") or entry.get("description") or ""
+            link = entry.get("link", "")
+            full = db_feed.send_full_content
+            fetch_full = db_feed.fetch_full_article
+            body = _strip_html(raw_html, limit=0 if full else 1500)
+
+            # Trafilatura: extract full article text from the link page
+            if fetch_full and link and len(body) < 500:
+                try:
+                    extra_text, extra_html = await asyncio.to_thread(
+                        _extract_full_article, link
+                    )
+                    if extra_text:
+                        body = extra_text
+                    if extra_html:
+                        raw_html = extra_html
+                except Exception:
+                    pass  # siliently fall back to RSS body

            if not _passes_filters(db_feed, title, body):
                skipped += 1
@@ -155,9 +277,78 @@ async def check_feed(feed: Feed) -> str:
                source=db_feed.title or feed_title,
                title=title,
                body=body,
-                link=entry.get("link", ""),
+                link=link,
                image=_extract_image(entry),
+                images=_extract_all_images(entry) if full else [],
+                full_html=raw_html if full else "",
+                videos=_extract_videos(entry) if full else [],
+                full_content=full,
            )
+
+            # Store article for RSS reader (always, including first_run entries)
+            try:
+                existing_art = session.exec(
+                    select(Article).where(
+                        Article.feed_id == db_feed.id,
+                        Article.link == link,
+                    )
+                ).first()
+                pub = entry.get("published_parsed")
+                pub_dt = None
+                if pub:
+                    try:
+                        pub_dt = datetime(*pub[:6], tzinfo=timezone.utc)
+                    except Exception:
+                        pass
+                if existing_art:
+                    existing_art.title = title
+                    existing_art.body = body
+                    existing_art.full_html = raw_html
+                    existing_art.image = msg.image
+                    if pub_dt:
+                        existing_art.published_at = pub_dt
+                    session.add(existing_art)
+                else:
+                    session.add(Article(
+                        feed_id=db_feed.id,
+                        feed_title=db_feed.title or feed_title,
+                        title=title,
+                        body=body,
+                        full_html=raw_html,
+                        link=link,
+                        image=msg.image,
+                        published_at=pub_dt,
+                    ))
+            except Exception:
+                pass  # article storage is best-effort
+
+            # Digest mode: store instead of dispatching
+            if db_feed.digest_enabled:
+                session.add(DigestEntry(
+                    feed_id=db_feed.id,
+                    title=title,
+                    link=link,
+                    body=body,
+                    image=msg.image,
+                    full_html=raw_html if full else "",
+                    images=json.dumps(msg.images) if full else "[]",
+                    videos=json.dumps(msg.videos) if full else "[]",
+                    full_content=full,
+                ))
+                # Record as seen but skip dispatch
+                sent += 1
+                session.add(
+                    Notification(
+                        feed_id=db_feed.id,
+                        feed_title=msg.source,
+                        title=title,
+                        link=msg.link,
+                        channels="digest",
+                        ok=True,
+                        detail="queued for digest",
+                    )
+                )
+                continue
            result = await delivery.dispatch(db_feed, settings, msg)

            session.add(