8 major features: trafilatura, digest, ntfy actions, templates, FTS5 search, backup/restore, proxy, RSS reader
build-and-push / docker (push) Has been cancelled

- Full article extraction via trafilatura (fetch_full_article)
- Digest mode with configurable period (digest_enabled, digest_period_hours)
- ntfy Actions buttons (Open article, Open feed)
- Notification templates with {title}, {body}, {link}, {source}, {image_url}
- FTS5 full-text search in notification history
- Database backup/restore (download/upload .db)
- HTTP/SOCKS proxy for RSS feed fetching (proxy_url setting)
- Built-in RSS reader tab with categories, unread counts, article detail view
- Auto-category 'Общее' for feeds without a category
- Article storage (Article table) for reader
- DigestEntry model for pending digest entries

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
dimon
2026-06-03 20:47:46 +08:00
parent f8d2c31658
commit 834092a3ec
13 changed files with 1414 additions and 44 deletions
+200 -9
View File
@@ -2,30 +2,38 @@
from __future__ import annotations
import asyncio
import json
import logging
import re
from datetime import datetime, timezone
from html import unescape
import httpx
from io import StringIO
import feedparser
from sqlmodel import Session, select
from . import delivery
from .database import engine, get_settings
from .delivery import Message
from .models import Feed, Notification, SeenEntry
from .models import Article, DigestEntry, Feed, Notification, SeenEntry
# Module logger; registered under the fixed name "checker".
log = logging.getLogger("checker")
# Matches any tag-like run "<...>"; used to strip markup from feed text.
_TAG_RE = re.compile(r"<[^>]+>")
# Captures the src attribute value of an <img> tag (case-insensitive).
_IMG_RE = re.compile(r'<img[^>]+src=["\']([^"\']+)["\']', re.IGNORECASE)
# Captures the src attribute value of a <video>, <iframe>, <source> or
# <embed> tag (case-insensitive).
_VIDEO_RE = re.compile(
r'<(?:video|iframe|source|embed)[^>]*src=["\']([^"\']+)["\']', re.IGNORECASE
)
# Matches a whole opening <video>/<iframe>/<source>/<embed> tag, with or
# without a src attribute.
# NOTE(review): no use of this pattern is visible in this part of the file —
# confirm it is referenced elsewhere before keeping it.
_ENC_VIDEO_RE = re.compile(r'<(?:video|iframe|source|embed)[^>]*>', re.IGNORECASE)
def _strip_html(text: str, limit: int = 1500) -> str:
    """Convert an HTML fragment to plain text, optionally truncated.

    Every tag is replaced by a space, HTML entities are unescaped, runs of
    spaces/tabs are collapsed to a single space, and three or more newlines
    (optionally separated by whitespace) are squeezed to one blank line.

    Args:
        text: HTML source. ``None`` or an empty string yields ``""``.
        limit: Maximum length in characters. A falsy value (``0``) disables
            truncation entirely.

    Returns:
        The cleaned text, cut at the last space before ``limit`` when it is
        longer than ``limit``.
    """
    text = unescape(_TAG_RE.sub(" ", text or ""))
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text).strip()
    # A single guard: ``limit=0`` must mean "no limit", so the length check
    # only applies when a positive limit was requested.
    if limit and len(text) > limit:
        # Cut on a word boundary: drop the (possibly partial) last word.
        # NOTE(review): the suffix appended here is an empty string; if a
        # truncation marker (e.g. an ellipsis) was intended it is missing —
        # confirm against the canonical source.
        text = text[:limit].rsplit(" ", 1)[0] + ""
    return text
@@ -57,6 +65,60 @@ def _extract_image(entry) -> str:
return match.group(1) if match else ""
def _extract_all_images(entry) -> list[str]:
    """Collect every image URL found in a feed entry.

    Sources are scanned in a fixed order: ``media_content`` and
    ``media_thumbnail`` items, enclosure links whose type starts with
    ``image``, then ``<img>`` tags in the entry's HTML (summary, then
    description, then the first ``content`` item as a fallback).

    Returns:
        The URLs with duplicates removed, first occurrence order kept.
    """
    found: list[str] = []

    # Media lists: take the "url" of every item that has one.
    for field in ("media_content", "media_thumbnail"):
        items = entry.get(field)
        if not (items and isinstance(items, list)):
            continue
        found.extend(u for u in (item.get("url") for item in items) if u)

    # Enclosure links that declare an image type.
    for lnk in entry.get("links", []):
        if lnk.get("rel") != "enclosure":
            continue
        if not str(lnk.get("type", "")).startswith("image"):
            continue
        target = lnk.get("href", "")
        if target:
            found.append(target)

    # <img> tags in the HTML body; fall back to the first content block
    # only when neither summary nor description carries any markup.
    markup = entry.get("summary") or entry.get("description") or ""
    if not markup:
        blocks = entry.get("content")
        if blocks and isinstance(blocks, list):
            markup = blocks[0].get("value", "")
    found += _IMG_RE.findall(markup or "")

    # dict keys keep insertion order, which gives an order-preserving dedupe.
    return list(dict.fromkeys(found))
def _extract_videos(entry) -> list[str]:
    """Collect video/multimedia URLs found in a feed entry.

    Enclosure links whose type starts with ``video`` come first, followed by
    the ``src`` of ``<video>``, ``<iframe>``, ``<source>`` and ``<embed>``
    tags in the entry's HTML (summary, then description, then the first
    ``content`` item as a fallback).

    Returns:
        The URLs with duplicates removed, first occurrence order kept.
    """
    # Enclosure links that declare a video type; empty hrefs are dropped.
    enclosure_hrefs = [
        lnk.get("href", "")
        for lnk in entry.get("links", [])
        if lnk.get("rel") == "enclosure"
        and str(lnk.get("type", "")).startswith("video")
    ]
    found: list[str] = [href for href in enclosure_hrefs if href]

    # Embedded players in the HTML body; fall back to the first content
    # block only when neither summary nor description carries any markup.
    markup = entry.get("summary") or entry.get("description") or ""
    if not markup:
        blocks = entry.get("content")
        if blocks and isinstance(blocks, list):
            markup = blocks[0].get("value", "")
    found += _VIDEO_RE.findall(markup or "")

    # dict keys keep insertion order, which gives an order-preserving dedupe.
    return list(dict.fromkeys(found))
def _passes_filters(feed: Feed, title: str, body: str) -> bool:
"""Keyword include/exclude check (case-insensitive)."""
haystack = f"{title}\n{body}".lower()
@@ -69,9 +131,43 @@ def _passes_filters(feed: Feed, title: str, body: str) -> bool:
return True
def _parse(url: str):
    """Parse the feed at *url* with feedparser.

    The call blocks, so it is meant to be run in a worker thread.
    """
    user_agent = "rss-ntfy/1.0 (+https://github.com)"
    return feedparser.parse(url, agent=user_agent)
def _parse_raw(xml: str):
    """Parse already-downloaded feed XML with feedparser.

    The text is wrapped in a ``StringIO`` so feedparser reads it as a stream.
    The call blocks, so it is meant to be run in a worker thread.
    """
    user_agent = "rss-ntfy/1.0 (+https://github.com)"
    stream = StringIO(xml)
    return feedparser.parse(stream, agent=user_agent)
async def _fetch_feed(url: str, proxy: str = "") -> str:
    """Download feed XML via httpx, optionally through a proxy.

    Args:
        url: Feed URL to fetch.
        proxy: Proxy URL to route the request through. Empty, whitespace-only
            or ``None`` means a direct connection.

    Returns:
        The response body decoded to text.

    Raises:
        httpx.HTTPStatusError: If the final response is not a success status.
        httpx.HTTPError: On transport-level failures (timeout, DNS, proxy).
    """
    # Tolerate an unset (None) proxy setting as well as stray whitespace.
    proxy = (proxy or "").strip()
    # httpx does not follow redirects unless asked to; without this a feed
    # that moved (e.g. http -> https) ends in raise_for_status() failing on
    # the 3xx response instead of returning the feed.
    client_kwargs = {"timeout": 30, "follow_redirects": True}
    if proxy:
        client_kwargs["proxy"] = proxy
    async with httpx.AsyncClient(**client_kwargs) as client:
        resp = await client.get(
            url,
            headers={"User-Agent": "rss-ntfy/1.0 (+https://github.com)"},
        )
        resp.raise_for_status()
        return resp.text
def _extract_full_article(url: str) -> tuple[str, str]:
    """Fetch a page and extract its main article content via trafilatura.

    Extraction is best-effort: any failure (including trafilatura not being
    importable) is logged and reported as an empty result rather than raised.

    Args:
        url: Address of the article page.

    Returns:
        ``(plain_text, markup)``, both stripped. ``markup`` is trafilatura's
        ``output_format="xml"`` rendering of the article. ``("", "")`` is
        returned when the page cannot be downloaded or extraction fails.
    """
    try:
        # Imported inside the try so an ImportError is handled like any
        # other extraction failure.
        import trafilatura

        downloaded = trafilatura.fetch_url(url)
        if downloaded is None:
            return "", ""
        plain = trafilatura.extract(
            downloaded, output_format="txt", with_metadata=False
        ) or ""
        html = trafilatura.extract(
            downloaded, output_format="xml", with_metadata=False
        ) or ""
        return plain.strip(), html.strip()
    except Exception as exc:
        # Stay best-effort, but leave a trace so failures can be diagnosed.
        log.warning("full-article extraction failed for %s: %s", url, exc)
        return "", ""
async def fetch_preview(url: str, include: str = "", exclude: str = "") -> dict:
@@ -79,7 +175,10 @@ async def fetch_preview(url: str, include: str = "", exclude: str = "") -> dict:
Raises ValueError if the feed can't be parsed or has no matching entries.
"""
parsed = await asyncio.to_thread(_parse, url)
with Session(engine) as s:
proxy = get_settings(s).proxy_url
raw_xml = await _fetch_feed(url, proxy=proxy)
parsed = await asyncio.to_thread(_parse_raw, raw_xml)
if getattr(parsed, "bozo", False) and not parsed.entries:
raise ValueError(str(getattr(parsed, "bozo_exception", "parse error")))
if not parsed.entries:
@@ -104,7 +203,13 @@ async def fetch_preview(url: str, include: str = "", exclude: str = "") -> dict:
async def check_feed(feed: Feed) -> str:
"""Check a single feed, dispatch new entries, log history. Returns status."""
parsed = await asyncio.to_thread(_parse, feed.url)
# Load settings early for proxy URL
with Session(engine) as _sess:
_settings = get_settings(_sess)
proxy_url = _settings.proxy_url
raw_xml = await _fetch_feed(feed.url, proxy=proxy_url)
parsed = await asyncio.to_thread(_parse_raw, raw_xml)
if getattr(parsed, "bozo", False) and not parsed.entries:
exc = getattr(parsed, "bozo_exception", "parse error")
@@ -145,7 +250,24 @@ async def check_feed(feed: Feed) -> str:
continue
title = entry.get("title", "(без заголовка)")
body = _strip_html(entry.get("summary") or entry.get("description") or "")
raw_html = entry.get("summary") or entry.get("description") or ""
link = entry.get("link", "")
full = db_feed.send_full_content
fetch_full = db_feed.fetch_full_article
body = _strip_html(raw_html, limit=0 if full else 1500)
# Trafilatura: extract full article text from the link page
if fetch_full and link and len(body) < 500:
try:
extra_text, extra_html = await asyncio.to_thread(
_extract_full_article, link
)
if extra_text:
body = extra_text
if extra_html:
raw_html = extra_html
except Exception:
pass # silently fall back to RSS body
if not _passes_filters(db_feed, title, body):
skipped += 1
@@ -155,9 +277,78 @@ async def check_feed(feed: Feed) -> str:
source=db_feed.title or feed_title,
title=title,
body=body,
link=entry.get("link", ""),
link=link,
image=_extract_image(entry),
images=_extract_all_images(entry) if full else [],
full_html=raw_html if full else "",
videos=_extract_videos(entry) if full else [],
full_content=full,
)
# Store article for RSS reader (always, including first_run entries)
try:
existing_art = session.exec(
select(Article).where(
Article.feed_id == db_feed.id,
Article.link == link,
)
).first()
pub = entry.get("published_parsed")
pub_dt = None
if pub:
try:
pub_dt = datetime(*pub[:6], tzinfo=timezone.utc)
except Exception:
pass
if existing_art:
existing_art.title = title
existing_art.body = body
existing_art.full_html = raw_html
existing_art.image = msg.image
if pub_dt:
existing_art.published_at = pub_dt
session.add(existing_art)
else:
session.add(Article(
feed_id=db_feed.id,
feed_title=db_feed.title or feed_title,
title=title,
body=body,
full_html=raw_html,
link=link,
image=msg.image,
published_at=pub_dt,
))
except Exception:
pass # article storage is best-effort
# Digest mode: store instead of dispatching
if db_feed.digest_enabled:
session.add(DigestEntry(
feed_id=db_feed.id,
title=title,
link=link,
body=body,
image=msg.image,
full_html=raw_html if full else "",
images=json.dumps(msg.images) if full else "[]",
videos=json.dumps(msg.videos) if full else "[]",
full_content=full,
))
# Record as seen but skip dispatch
sent += 1
session.add(
Notification(
feed_id=db_feed.id,
feed_title=msg.source,
title=title,
link=msg.link,
channels="digest",
ok=True,
detail="queued for digest",
)
)
continue
result = await delivery.dispatch(db_feed, settings, msg)
session.add(