Files
rss-ntfy/app/checker.py
T
dimon bf52bc3079
build-and-push / docker (push) Has been cancelled
RSS → ntfy bridge with modern web UI
Features: feed CRUD, per-feed ntfy target (incl. private servers),
Telegram/webhook channels, keyword filters, image attachments,
per-feed intervals, OPML import/export, notification history & stats,
users with roles, admin alerts, RU/EN i18n, light/dark theme,
notification preview, history search, activity chart. Dockerized.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 21:11:57 +08:00

255 lines
9.0 KiB
Python

"""Background RSS polling and dispatch across channels."""
from __future__ import annotations
import asyncio
import logging
import re
from datetime import datetime, timezone
from html import unescape
import feedparser
from sqlmodel import Session, select
from . import delivery
from .database import engine, get_settings
from .delivery import Message
from .models import Feed, Notification, SeenEntry
# Module-level logger; records are emitted under the fixed name "checker".
log = logging.getLogger("checker")
# Matches one markup tag: "<", one or more characters other than ">", then ">".
_TAG_RE = re.compile(r"<[^>]+>")
# Captures (group 1) the quoted src attribute value of an <img> tag,
# case-insensitively. Unquoted src values are not matched.
_IMG_RE = re.compile(r'<img[^>]+src=["\']([^"\']+)["\']', re.IGNORECASE)
def _strip_html(text: str, limit: int = 1500) -> str:
    """Convert an HTML fragment to plain text capped at about *limit* characters.

    Tags are replaced by a space, HTML entities are unescaped, runs of spaces
    and tabs collapse to a single space, and three or more consecutive
    (optionally whitespace-padded) newlines collapse to one blank line.

    Text longer than *limit* is cut at *limit* characters, backed up to the
    last space so that a word is not split, and suffixed with a
    one-character ellipsis so readers can tell it was shortened. The marker
    is appended after the cut, so the result can be one character longer
    than the kept prefix.

    Args:
        text: HTML or plain text; a falsy value (``None``, ``""``) yields ``""``.
        limit: Maximum number of characters kept before the ellipsis.

    Returns:
        The cleaned, possibly truncated, text.
    """
    text = unescape(_TAG_RE.sub(" ", text or ""))
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text).strip()
    if len(text) > limit:
        # Mark the cut explicitly; "\u2026" is the horizontal ellipsis character.
        text = text[:limit].rsplit(" ", 1)[0].rstrip() + "\u2026"
    return text
def _entry_uid(entry) -> str:
for key in ("id", "guid", "link"):
value = entry.get(key)
if value:
return str(value)
return f"{entry.get('title', '')}|{entry.get('published', '')}"
def _extract_image(entry) -> str:
"""Best-effort: find an image URL in media tags, enclosures or HTML."""
media = entry.get("media_content") or entry.get("media_thumbnail")
if media and isinstance(media, list):
url = media[0].get("url")
if url:
return url
for link in entry.get("links", []):
if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("image"):
return link.get("href", "")
html = entry.get("summary") or entry.get("description") or ""
if not html:
content = entry.get("content")
if content and isinstance(content, list):
html = content[0].get("value", "")
match = _IMG_RE.search(html or "")
return match.group(1) if match else ""
def _passes_filters(feed: Feed, title: str, body: str) -> bool:
"""Keyword include/exclude check (case-insensitive)."""
haystack = f"{title}\n{body}".lower()
includes = [k.strip().lower() for k in feed.filter_include.split(",") if k.strip()]
excludes = [k.strip().lower() for k in feed.filter_exclude.split(",") if k.strip()]
if includes and not any(k in haystack for k in includes):
return False
if excludes and any(k in haystack for k in excludes):
return False
return True
def _parse(url: str):
    """Parse *url* with feedparser and return its result.

    The call is synchronous; the async callers in this module run it through
    ``asyncio.to_thread``.
    """
    user_agent = "rss-ntfy/1.0 (+https://github.com)"
    return feedparser.parse(url, agent=user_agent)
async def fetch_preview(url: str, include: str = "", exclude: str = "") -> dict:
    """Fetch a feed and build a preview from the first entry passing the filters.

    Entries are examined in the order the parser returns them (presumably
    newest first; this depends on the feed).

    Args:
        url: Feed address to parse.
        include: Comma-separated keywords, at least one of which must occur.
        exclude: Comma-separated keywords, none of which may occur.

    Returns:
        A dict with the keys ``source``, ``title``, ``body``, ``image`` and
        ``link`` describing the matching entry.

    Raises:
        ValueError: If the feed cannot be parsed, has no entries, or has no
            entry matching the filters.
    """
    parsed = await asyncio.to_thread(_parse, url)
    entries = parsed.entries
    if not entries:
        if getattr(parsed, "bozo", False):
            raise ValueError(str(getattr(parsed, "bozo_exception", "parse error")))
        raise ValueError("no entries")

    # A throwaway Feed only carries the filter strings for _passes_filters.
    probe = Feed(url=url, filter_include=include, filter_exclude=exclude)
    source = parsed.feed.get("title", "") if parsed.feed else ""

    for entry in entries:
        headline = entry.get("title", "")
        text = _strip_html(entry.get("summary") or entry.get("description") or "")
        if _passes_filters(probe, headline, text):
            return {
                "source": source,
                "title": headline or "(no title)",
                "body": text,
                "image": _extract_image(entry),
                "link": entry.get("link", ""),
            }
    raise ValueError("no entries match the filters")
async def check_feed(feed: Feed) -> str:
    """Check a single feed, dispatch new entries, log history. Returns status.

    The feed is parsed in a worker thread. Entries whose uid is not yet
    stored as a ``SeenEntry`` are processed oldest first: each is filtered,
    dispatched through ``delivery.dispatch`` and logged as a ``Notification``.

    On the very first check (no stored uids) entries are only recorded as
    seen and nothing is sent.

    If a dispatch result is not ok and lists no channels, processing stops.
    That entry is deliberately left unrecorded so the next check retries it
    instead of dropping it.

    Args:
        feed: The feed to check; ``feed.id`` and ``feed.url`` are read.

    Returns:
        One of these status strings, which is also stored in
        ``last_status`` except for the deleted-feed case:

        - ``"parse_error:<exc>"``: the parser flagged an error and returned
          no entries;
        - ``"Лента удалена"``: the feed row no longer exists;
        - ``"send_error:<detail>"``: a dispatch failed on every channel;
        - ``"init:<n>"``: first check, ``n`` entries recorded;
        - ``"sent:<sent>"`` or ``"sent:<sent>:<skipped>"``;
        - ``"filtered:<skipped>"``: new entries existed but all were filtered;
        - ``"nochange"``: nothing new was sent or filtered.
    """
    parsed = await asyncio.to_thread(_parse, feed.url)
    if getattr(parsed, "bozo", False) and not parsed.entries:
        exc = getattr(parsed, "bozo_exception", "parse error")
        status = f"parse_error:{exc}"
        await _record_failure(feed.id, status)
        return status

    feed_title = parsed.feed.get("title", "") if parsed.feed else ""

    with Session(engine) as session:
        settings = get_settings(session)
        db_feed = session.get(Feed, feed.id)
        if db_feed is None:
            return "Лента удалена"
        if feed_title and not db_feed.title:
            db_feed.title = feed_title

        seen_uids = set(
            session.exec(
                select(SeenEntry.entry_uid).where(SeenEntry.feed_id == feed.id)
            ).all()
        )
        first_run = not seen_uids
        sent = 0
        skipped = 0

        # Oldest first so notifications arrive in chronological order.
        for entry in reversed(parsed.entries):
            uid = _entry_uid(entry)
            if uid in seen_uids:
                continue
            seen_uids.add(uid)
            seen_row = SeenEntry(feed_id=feed.id, entry_uid=uid)

            # On the very first check we only record state, never spam history.
            if first_run:
                session.add(seen_row)
                continue

            # "or" (not a .get default) so empty/None titles get the placeholder too.
            title = entry.get("title") or "(без заголовка)"
            body = _strip_html(entry.get("summary") or entry.get("description") or "")
            if not _passes_filters(db_feed, title, body):
                session.add(seen_row)
                skipped += 1
                continue

            msg = Message(
                source=db_feed.title or feed_title,
                title=title,
                body=body,
                link=entry.get("link", ""),
                image=_extract_image(entry),
            )
            result = await delivery.dispatch(db_feed, settings, msg)
            session.add(
                Notification(
                    feed_id=db_feed.id,
                    feed_title=msg.source,
                    title=title,
                    link=msg.link,
                    channels=",".join(result.channels),
                    ok=result.ok,
                    detail=result.detail,
                )
            )

            if not result.ok and not result.channels:
                # Hard failure (e.g. ntfy unreachable): surface it and stop.
                # seen_row is intentionally NOT added, so this entry and
                # all later ones are retried on the next check.
                status = f"send_error:{result.detail}"
                db_feed.last_checked = datetime.now(timezone.utc)
                db_feed.last_status = status
                db_feed.error_streak += 1
                session.commit()
                await _maybe_alert(feed.id)
                return status

            session.add(seen_row)
            if result.ok:
                sent += 1

        if first_run:
            status = f"init:{len(seen_uids)}"
        elif sent:
            status = f"sent:{sent}:{skipped}" if skipped else f"sent:{sent}"
        elif skipped:
            status = f"filtered:{skipped}"
        else:
            status = "nochange"

        db_feed.last_checked = datetime.now(timezone.utc)
        db_feed.error_streak = 0
        db_feed.last_status = status
        session.commit()

    # Return the local copy: ORM attributes are expired by commit().
    return status
async def _record_failure(feed_id: int, status: str) -> None:
    """Store a failed check on the feed row, then run the alert check.

    Sets ``last_status`` to *status*, stamps ``last_checked`` with the
    current UTC time and increments ``error_streak``. Returns without doing
    anything when no feed with *feed_id* exists.
    """
    with Session(engine) as db:
        row = db.get(Feed, feed_id)
        if row is None:
            return
        row.last_status = status
        row.last_checked = datetime.now(timezone.utc)
        row.error_streak += 1
        db.commit()
    await _maybe_alert(feed_id)
async def _maybe_alert(feed_id: int) -> None:
    """Send an admin alert when a feed's error streak reaches the threshold.

    Nothing is sent when the feed does not exist, when alerts are disabled
    in the settings, or when the streak differs from ``alert_threshold``.
    """
    with Session(engine) as db:
        settings = get_settings(db)
        row = db.get(Feed, feed_id)
        if row is None or not settings.alerts_enabled:
            return
        # Equality (not >=) means one alert per streak, at the crossing point.
        if row.error_streak != settings.alert_threshold:
            return
        label = row.title or row.url
        text = (
            f"Feed \"{label}\" is failing "
            f"({row.error_streak} consecutive errors)."
        )
        await delivery.send_admin_alert(settings, text)
async def check_all_feeds() -> None:
    """Check every enabled feed whose interval has elapsed.

    A feed is due when it has never been checked, or when at least
    ``interval`` minutes have passed since ``last_checked``. A missing or
    non-positive per-feed interval falls back to the ``check_interval``
    setting. Due feeds are checked one after another; an exception from one
    feed is logged and does not stop the rest.
    """
    now = datetime.now(timezone.utc)

    def is_due(candidate, fallback_minutes) -> bool:
        if candidate.last_checked is None:
            return True
        if candidate.interval and candidate.interval > 0:
            minutes = candidate.interval
        else:
            minutes = fallback_minutes
        last = candidate.last_checked
        # Stored timestamps may be naive; they are interpreted as UTC.
        if last.tzinfo is None:
            last = last.replace(tzinfo=timezone.utc)
        return (now - last).total_seconds() >= minutes * 60

    with Session(engine) as session:
        settings = get_settings(session)
        feeds = session.exec(select(Feed).where(Feed.enabled == True)).all()  # noqa: E712
        default_interval = settings.check_interval
        due: list[Feed] = [feed for feed in feeds if is_due(feed, default_interval)]

    if not due:
        return

    log.info("Проверка %d из %d лент", len(due), len(feeds))
    for feed in due:
        try:
            await check_feed(feed)
        except Exception as exc:  # noqa: BLE001
            log.exception("Ошибка проверки ленты %s: %s", feed.url, exc)