"""Background RSS polling and dispatch across channels.""" from __future__ import annotations import asyncio import logging import re from datetime import datetime, timezone from html import unescape import feedparser from sqlmodel import Session, select from . import delivery from .database import engine, get_settings from .delivery import Message from .models import Feed, Notification, SeenEntry log = logging.getLogger("checker") _TAG_RE = re.compile(r"<[^>]+>") _IMG_RE = re.compile(r']+src=["\']([^"\']+)["\']', re.IGNORECASE) def _strip_html(text: str, limit: int = 1500) -> str: text = unescape(_TAG_RE.sub(" ", text or "")) text = re.sub(r"[ \t]+", " ", text) text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text).strip() if len(text) > limit: text = text[:limit].rsplit(" ", 1)[0] + " …" return text def _entry_uid(entry) -> str: for key in ("id", "guid", "link"): value = entry.get(key) if value: return str(value) return f"{entry.get('title', '')}|{entry.get('published', '')}" def _extract_image(entry) -> str: """Best-effort: find an image URL in media tags, enclosures or HTML.""" media = entry.get("media_content") or entry.get("media_thumbnail") if media and isinstance(media, list): url = media[0].get("url") if url: return url for link in entry.get("links", []): if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("image"): return link.get("href", "") html = entry.get("summary") or entry.get("description") or "" if not html: content = entry.get("content") if content and isinstance(content, list): html = content[0].get("value", "") match = _IMG_RE.search(html or "") return match.group(1) if match else "" def _passes_filters(feed: Feed, title: str, body: str) -> bool: """Keyword include/exclude check (case-insensitive).""" haystack = f"{title}\n{body}".lower() includes = [k.strip().lower() for k in feed.filter_include.split(",") if k.strip()] excludes = [k.strip().lower() for k in feed.filter_exclude.split(",") if k.strip()] if includes and not any(k in haystack for k in includes): return False if excludes and any(k in haystack for k in excludes): return False return True def _parse(url: str): """Blocking feedparser call (run in a thread).""" return feedparser.parse(url, agent="rss-ntfy/1.0 (+https://github.com)") async def fetch_preview(url: str, include: str = "", exclude: str = "") -> dict: """Fetch a feed and return the newest entry passing filters, for previewing. Raises ValueError if the feed can't be parsed or has no matching entries. """ parsed = await asyncio.to_thread(_parse, url) if getattr(parsed, "bozo", False) and not parsed.entries: raise ValueError(str(getattr(parsed, "bozo_exception", "parse error"))) if not parsed.entries: raise ValueError("no entries") probe = Feed(url=url, filter_include=include, filter_exclude=exclude) feed_title = parsed.feed.get("title", "") if parsed.feed else "" for entry in parsed.entries: title = entry.get("title", "") body = _strip_html(entry.get("summary") or entry.get("description") or "") if not _passes_filters(probe, title, body): continue return { "source": feed_title, "title": title or "(no title)", "body": body, "image": _extract_image(entry), "link": entry.get("link", ""), } raise ValueError("no entries match the filters") async def check_feed(feed: Feed) -> str: """Check a single feed, dispatch new entries, log history. Returns status.""" parsed = await asyncio.to_thread(_parse, feed.url) if getattr(parsed, "bozo", False) and not parsed.entries: exc = getattr(parsed, "bozo_exception", "parse error") status = f"parse_error:{exc}" await _record_failure(feed.id, status) return status feed_title = parsed.feed.get("title", "") if parsed.feed else "" with Session(engine) as session: settings = get_settings(session) db_feed = session.get(Feed, feed.id) if db_feed is None: return "Лента удалена" if feed_title and not db_feed.title: db_feed.title = feed_title seen_uids = set( session.exec( select(SeenEntry.entry_uid).where(SeenEntry.feed_id == feed.id) ).all() ) first_run = len(seen_uids) == 0 sent = 0 skipped = 0 # Oldest first so notifications arrive in chronological order. for entry in reversed(parsed.entries): uid = _entry_uid(entry) if uid in seen_uids: continue seen_uids.add(uid) session.add(SeenEntry(feed_id=feed.id, entry_uid=uid)) # On the very first check we only record state, never spam history. if first_run: continue title = entry.get("title", "(без заголовка)") body = _strip_html(entry.get("summary") or entry.get("description") or "") if not _passes_filters(db_feed, title, body): skipped += 1 continue msg = Message( source=db_feed.title or feed_title, title=title, body=body, link=entry.get("link", ""), image=_extract_image(entry), ) result = await delivery.dispatch(db_feed, settings, msg) session.add( Notification( feed_id=db_feed.id, feed_title=msg.source, title=title, link=msg.link, channels=",".join(result.channels), ok=result.ok, detail=result.detail, ) ) if result.ok: sent += 1 elif not result.channels: # Hard failure (e.g. ntfy unreachable) — surface it and stop. db_feed.last_checked = datetime.now(timezone.utc) db_feed.last_status = f"send_error:{result.detail}" db_feed.error_streak += 1 session.commit() await _maybe_alert(db_feed.id) return db_feed.last_status db_feed.last_checked = datetime.now(timezone.utc) db_feed.error_streak = 0 if first_run: db_feed.last_status = f"init:{len(seen_uids)}" elif sent: db_feed.last_status = f"sent:{sent}:{skipped}" if skipped else f"sent:{sent}" elif skipped: db_feed.last_status = f"filtered:{skipped}" else: db_feed.last_status = "nochange" session.commit() return db_feed.last_status async def _record_failure(feed_id: int, status: str) -> None: with Session(engine) as session: db_feed = session.get(Feed, feed_id) if db_feed is None: return db_feed.last_checked = datetime.now(timezone.utc) db_feed.last_status = status db_feed.error_streak += 1 session.commit() await _maybe_alert(feed_id) async def _maybe_alert(feed_id: int) -> None: """Send an admin alert if a feed has failed too many times in a row.""" with Session(engine) as session: settings = get_settings(session) db_feed = session.get(Feed, feed_id) if db_feed is None or not settings.alerts_enabled: return # Alert once, exactly when the streak crosses the threshold. if db_feed.error_streak == settings.alert_threshold: text = ( f"Feed \"{db_feed.title or db_feed.url}\" is failing " f"({db_feed.error_streak} consecutive errors)." ) await delivery.send_admin_alert(settings, text) async def check_all_feeds() -> None: """Check feeds whose per-feed interval has elapsed (1-minute tick).""" now = datetime.now(timezone.utc) with Session(engine) as session: settings = get_settings(session) feeds = session.exec(select(Feed).where(Feed.enabled == True)).all() # noqa: E712 default_interval = settings.check_interval due: list[Feed] = [] for feed in feeds: interval = feed.interval if feed.interval and feed.interval > 0 else default_interval if feed.last_checked is None: due.append(feed) continue last = feed.last_checked if last.tzinfo is None: last = last.replace(tzinfo=timezone.utc) if (now - last).total_seconds() >= interval * 60: due.append(feed) if not due: return log.info("Проверка %d из %d лент", len(due), len(feeds)) for feed in due: try: await check_feed(feed) except Exception as exc: # noqa: BLE001 log.exception("Ошибка проверки ленты %s: %s", feed.url, exc)