✨ 8 major features: trafilatura, digest, ntfy actions, templates, FTS5 search, backup/restore, proxy, RSS reader
build-and-push / docker (push) Has been cancelled
build-and-push / docker (push) Has been cancelled
- Full article extraction via trafilatura (fetch_full_article)
- Digest mode with configurable period (digest_enabled, digest_period_hours)
- ntfy Actions buttons (Open article, Open feed)
- Notification templates with {title}, {body}, {link}, {source}, {image_url}
- FTS5 full-text search in notification history
- Database backup/restore (download/upload .db)
- HTTP/SOCKS proxy for RSS feed fetching (proxy_url setting)
- Built-in RSS reader tab with categories, unread counts, article detail view
- Auto-category 'Общее' for feeds without a category
- Article storage (Article table) for reader
- DigestEntry model for pending digest entries
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+200
-9
@@ -2,30 +2,38 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from html import unescape
|
||||
|
||||
import httpx
|
||||
from io import StringIO
|
||||
|
||||
import feedparser
|
||||
from sqlmodel import Session, select
|
||||
|
||||
from . import delivery
|
||||
from .database import engine, get_settings
|
||||
from .delivery import Message
|
||||
from .models import Feed, Notification, SeenEntry
|
||||
from .models import Article, DigestEntry, Feed, Notification, SeenEntry
|
||||
|
||||
log = logging.getLogger("checker")
|
||||
|
||||
_TAG_RE = re.compile(r"<[^>]+>")
|
||||
_IMG_RE = re.compile(r'<img[^>]+src=["\']([^"\']+)["\']', re.IGNORECASE)
|
||||
_VIDEO_RE = re.compile(
|
||||
r'<(?:video|iframe|source|embed)[^>]*src=["\']([^"\']+)["\']', re.IGNORECASE
|
||||
)
|
||||
_ENC_VIDEO_RE = re.compile(r'<(?:video|iframe|source|embed)[^>]*>', re.IGNORECASE)
|
||||
|
||||
|
||||
def _strip_html(text: str, limit: int = 1500) -> str:
    """Convert an HTML fragment to plain text, optionally truncated.

    Tags are replaced by spaces, HTML entities are unescaped, runs of
    spaces/tabs are collapsed to one space, and three or more line breaks
    (possibly separated by whitespace) are reduced to a single blank line.

    Args:
        text: HTML markup; a falsy value (``None``, ``""``) yields ``""``.
        limit: Maximum length before truncation. A falsy value (``0``)
            disables truncation entirely.

    Returns:
        The cleaned text. When truncated, the text is cut back to the last
        space inside the first ``limit`` characters (or kept as that prefix
        if it holds no space) and suffixed with ``" …"``.
    """
    text = unescape(_TAG_RE.sub(" ", text or ""))
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text).strip()
    # ``limit=0`` means "no limit": without the truthiness guard any
    # non-empty text would be cut down to just the ellipsis suffix.
    if limit and len(text) > limit:
        text = text[:limit].rsplit(" ", 1)[0] + " …"
    return text
|
||||
|
||||
@@ -57,6 +65,60 @@ def _extract_image(entry) -> str:
|
||||
return match.group(1) if match else ""
|
||||
|
||||
|
||||
def _extract_all_images(entry) -> list[str]:
|
||||
"""Extract ALL image URLs from a feed entry (deduplicated, order preserved)."""
|
||||
urls: list[str] = []
|
||||
|
||||
# media_content / media_thumbnail
|
||||
for key in ("media_content", "media_thumbnail"):
|
||||
media = entry.get(key)
|
||||
if media and isinstance(media, list):
|
||||
for item in media:
|
||||
url = item.get("url")
|
||||
if url:
|
||||
urls.append(url)
|
||||
|
||||
# enclosure links with image/ type
|
||||
for link in entry.get("links", []):
|
||||
if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("image"):
|
||||
href = link.get("href", "")
|
||||
if href:
|
||||
urls.append(href)
|
||||
|
||||
# <img> tags in HTML body
|
||||
html = entry.get("summary") or entry.get("description") or ""
|
||||
if not html:
|
||||
content = entry.get("content")
|
||||
if content and isinstance(content, list):
|
||||
html = content[0].get("value", "")
|
||||
urls.extend(_IMG_RE.findall(html or ""))
|
||||
|
||||
# deduplicate preserving order
|
||||
return list(dict.fromkeys(urls))
|
||||
|
||||
|
||||
def _extract_videos(entry) -> list[str]:
|
||||
"""Extract video/multimedia URLs from a feed entry."""
|
||||
urls: list[str] = []
|
||||
|
||||
# enclosure links with video/ type
|
||||
for link in entry.get("links", []):
|
||||
if link.get("rel") == "enclosure" and str(link.get("type", "")).startswith("video"):
|
||||
href = link.get("href", "")
|
||||
if href:
|
||||
urls.append(href)
|
||||
|
||||
# <video>, <iframe>, <source>, <embed> tags in HTML body
|
||||
html = entry.get("summary") or entry.get("description") or ""
|
||||
if not html:
|
||||
content = entry.get("content")
|
||||
if content and isinstance(content, list):
|
||||
html = content[0].get("value", "")
|
||||
urls.extend(_VIDEO_RE.findall(html or ""))
|
||||
|
||||
return list(dict.fromkeys(urls))
|
||||
|
||||
|
||||
def _passes_filters(feed: Feed, title: str, body: str) -> bool:
|
||||
"""Keyword include/exclude check (case-insensitive)."""
|
||||
haystack = f"{title}\n{body}".lower()
|
||||
@@ -69,9 +131,43 @@ def _passes_filters(feed: Feed, title: str, body: str) -> bool:
|
||||
return True
|
||||
|
||||
|
||||
def _parse(url: str):
    """Parse the feed at ``url`` with feedparser.

    The call blocks, so it is meant to be run in a worker thread.
    """
    user_agent = "rss-ntfy/1.0 (+https://github.com)"
    return feedparser.parse(url, agent=user_agent)
|
||||
def _parse_raw(xml: str):
    """Parse already-downloaded feed XML with feedparser.

    The text is wrapped in a ``StringIO`` stream before being handed over.
    The call blocks, so it is meant to be run in a worker thread.
    """
    user_agent = "rss-ntfy/1.0 (+https://github.com)"
    stream = StringIO(xml)
    return feedparser.parse(stream, agent=user_agent)
|
||||
|
||||
|
||||
async def _fetch_feed(url: str, proxy: str = "") -> str:
    """Download feed XML via httpx (supports proxy).

    Args:
        url: Feed URL to request.
        proxy: Proxy URL passed to the httpx client. An empty,
            whitespace-only or ``None`` value means a direct connection.

    Returns:
        The response body decoded to text by httpx.

    Raises:
        httpx.HTTPStatusError: The final response has a non-success status.
        Other httpx transport/timeout errors propagate unchanged.
    """
    client_kwargs: dict = {
        "timeout": 30,
        # httpx does not follow redirects unless asked to; without this a
        # feed that moved (e.g. http -> https) fails at raise_for_status().
        "follow_redirects": True,
    }
    # Tolerate an unset (None) proxy setting instead of failing on .strip().
    proxy = (proxy or "").strip()
    if proxy:
        client_kwargs["proxy"] = proxy

    request_headers = {"User-Agent": "rss-ntfy/1.0 (+https://github.com)"}
    async with httpx.AsyncClient(**client_kwargs) as client:
        resp = await client.get(url, headers=request_headers)
        resp.raise_for_status()
        return resp.text
|
||||
|
||||
|
||||
def _extract_full_article(url: str) -> tuple[str, str]:
    """Fetch a page and extract its main article text via trafilatura.

    trafilatura is imported lazily inside the function, so a missing
    package only disables this feature instead of breaking module import.

    Args:
        url: Address of the article page.

    Returns:
        ``(plain_text, markup)``, both stripped. ``markup`` is what
        trafilatura produces for ``output_format="xml"``. ``("", "")`` is
        returned when the download yields nothing or any step fails.
    """
    try:
        import trafilatura
    except ImportError:
        log.warning("trafilatura is not available; full article extraction skipped")
        return "", ""

    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded is None:
            return "", ""
        plain = trafilatura.extract(
            downloaded, output_format="txt", with_metadata=False
        ) or ""
        html = trafilatura.extract(
            downloaded, output_format="xml", with_metadata=False
        ) or ""
        return plain.strip(), html.strip()
    except Exception:
        # Best-effort: the caller falls back to the RSS body, but the failure
        # is logged so broken extraction does not go unnoticed.
        log.warning("full article extraction failed for %s", url, exc_info=True)
        return "", ""
|
||||
|
||||
|
||||
async def fetch_preview(url: str, include: str = "", exclude: str = "") -> dict:
|
||||
@@ -79,7 +175,10 @@ async def fetch_preview(url: str, include: str = "", exclude: str = "") -> dict:
|
||||
|
||||
Raises ValueError if the feed can't be parsed or has no matching entries.
|
||||
"""
|
||||
parsed = await asyncio.to_thread(_parse, url)
|
||||
with Session(engine) as s:
|
||||
proxy = get_settings(s).proxy_url
|
||||
raw_xml = await _fetch_feed(url, proxy=proxy)
|
||||
parsed = await asyncio.to_thread(_parse_raw, raw_xml)
|
||||
if getattr(parsed, "bozo", False) and not parsed.entries:
|
||||
raise ValueError(str(getattr(parsed, "bozo_exception", "parse error")))
|
||||
if not parsed.entries:
|
||||
@@ -104,7 +203,13 @@ async def fetch_preview(url: str, include: str = "", exclude: str = "") -> dict:
|
||||
|
||||
async def check_feed(feed: Feed) -> str:
|
||||
"""Check a single feed, dispatch new entries, log history. Returns status."""
|
||||
parsed = await asyncio.to_thread(_parse, feed.url)
|
||||
# Load settings early for proxy URL
|
||||
with Session(engine) as _sess:
|
||||
_settings = get_settings(_sess)
|
||||
proxy_url = _settings.proxy_url
|
||||
|
||||
raw_xml = await _fetch_feed(feed.url, proxy=proxy_url)
|
||||
parsed = await asyncio.to_thread(_parse_raw, raw_xml)
|
||||
|
||||
if getattr(parsed, "bozo", False) and not parsed.entries:
|
||||
exc = getattr(parsed, "bozo_exception", "parse error")
|
||||
@@ -145,7 +250,24 @@ async def check_feed(feed: Feed) -> str:
|
||||
continue
|
||||
|
||||
title = entry.get("title", "(без заголовка)")
|
||||
body = _strip_html(entry.get("summary") or entry.get("description") or "")
|
||||
raw_html = entry.get("summary") or entry.get("description") or ""
|
||||
link = entry.get("link", "")
|
||||
full = db_feed.send_full_content
|
||||
fetch_full = db_feed.fetch_full_article
|
||||
body = _strip_html(raw_html, limit=0 if full else 1500)
|
||||
|
||||
# Trafilatura: extract full article text from the link page
|
||||
if fetch_full and link and len(body) < 500:
|
||||
try:
|
||||
extra_text, extra_html = await asyncio.to_thread(
|
||||
_extract_full_article, link
|
||||
)
|
||||
if extra_text:
|
||||
body = extra_text
|
||||
if extra_html:
|
||||
raw_html = extra_html
|
||||
except Exception:
|
||||
pass # silently fall back to RSS body
|
||||
|
||||
if not _passes_filters(db_feed, title, body):
|
||||
skipped += 1
|
||||
@@ -155,9 +277,78 @@ async def check_feed(feed: Feed) -> str:
|
||||
source=db_feed.title or feed_title,
|
||||
title=title,
|
||||
body=body,
|
||||
link=entry.get("link", ""),
|
||||
link=link,
|
||||
image=_extract_image(entry),
|
||||
images=_extract_all_images(entry) if full else [],
|
||||
full_html=raw_html if full else "",
|
||||
videos=_extract_videos(entry) if full else [],
|
||||
full_content=full,
|
||||
)
|
||||
|
||||
# Store article for RSS reader (always, including first_run entries)
|
||||
try:
|
||||
existing_art = session.exec(
|
||||
select(Article).where(
|
||||
Article.feed_id == db_feed.id,
|
||||
Article.link == link,
|
||||
)
|
||||
).first()
|
||||
pub = entry.get("published_parsed")
|
||||
pub_dt = None
|
||||
if pub:
|
||||
try:
|
||||
pub_dt = datetime(*pub[:6], tzinfo=timezone.utc)
|
||||
except Exception:
|
||||
pass
|
||||
if existing_art:
|
||||
existing_art.title = title
|
||||
existing_art.body = body
|
||||
existing_art.full_html = raw_html
|
||||
existing_art.image = msg.image
|
||||
if pub_dt:
|
||||
existing_art.published_at = pub_dt
|
||||
session.add(existing_art)
|
||||
else:
|
||||
session.add(Article(
|
||||
feed_id=db_feed.id,
|
||||
feed_title=db_feed.title or feed_title,
|
||||
title=title,
|
||||
body=body,
|
||||
full_html=raw_html,
|
||||
link=link,
|
||||
image=msg.image,
|
||||
published_at=pub_dt,
|
||||
))
|
||||
except Exception:
|
||||
pass # article storage is best-effort
|
||||
|
||||
# Digest mode: store instead of dispatching
|
||||
if db_feed.digest_enabled:
|
||||
session.add(DigestEntry(
|
||||
feed_id=db_feed.id,
|
||||
title=title,
|
||||
link=link,
|
||||
body=body,
|
||||
image=msg.image,
|
||||
full_html=raw_html if full else "",
|
||||
images=json.dumps(msg.images) if full else "[]",
|
||||
videos=json.dumps(msg.videos) if full else "[]",
|
||||
full_content=full,
|
||||
))
|
||||
# Record as seen but skip dispatch
|
||||
sent += 1
|
||||
session.add(
|
||||
Notification(
|
||||
feed_id=db_feed.id,
|
||||
feed_title=msg.source,
|
||||
title=title,
|
||||
link=msg.link,
|
||||
channels="digest",
|
||||
ok=True,
|
||||
detail="queued for digest",
|
||||
)
|
||||
)
|
||||
continue
|
||||
result = await delivery.dispatch(db_feed, settings, msg)
|
||||
|
||||
session.add(
|
||||
|
||||
Reference in New Issue
Block a user