我有一个简单的 Flask 应用程序,可以从一些 RSS 提要中获取文章。这是代码:
import logging
import re
import xml.etree.ElementTree as ET

import requests
from dateutil import parser

from feeds_config import FEEDS
# Matches the src attribute of the first <img> tag, with either quote style.
_IMG_SRC_RE = re.compile(
    r'<img[^>]+src=(?:"([^">]+)"'  # double-quoted value -> group 1
    r"|'([^'>]+)')"                # single-quoted value -> group 2
)


def extract_image_from_content(content):
    """Return the URL of the first ``<img>`` tag found in an HTML fragment.

    Args:
        content: HTML text to scan. ``None`` and the empty string are
            accepted and yield ``None``.

    Returns:
        The ``src`` attribute value of the first ``<img>`` tag that has a
        quoted ``src``, or ``None`` when there is no such tag.
    """
    if not content:
        # An empty XML element has ``.text`` of None, and re.search() would
        # raise TypeError on it.
        return None
    match = _IMG_SRC_RE.search(content)
    if match is None:
        return None
    # Exactly one of the two alternatives captured a value.
    return match.group(1) or match.group(2)
_logger = logging.getLogger(__name__)

# Upper bound, in seconds, for connecting to and reading from a feed host, so
# one unresponsive feed cannot hang the whole request.
_FEED_TIMEOUT_SECONDS = 10

# Sent with every feed request. NOTE(review): some feed hosts appear to reject
# the default python-requests User-Agent, especially from cloud IP ranges;
# confirm against the HTTP status that fetch_articles() logs for the feed.
_FEED_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; FeedFetcher/1.0)",
    "Accept": "application/rss+xml, application/xml;q=0.9, */*;q=0.8",
}


def _child_text(item, tag):
    """Return the text of the direct child *tag* of *item*, or None if absent."""
    child = item.find(tag)
    return child.text if child is not None else None


def _parse_feed_item(item, feed):
    """Build an article dict from one ``<item>``; None if it has no usable date."""
    pub_date = _child_text(item, "pubDate")
    if not pub_date:
        return None
    try:
        timestamp = parser.parse(pub_date)
    except (ValueError, OverflowError):
        # dateutil signals unparseable input with ValueError (ParserError)
        # or OverflowError.
        _logger.warning("Skipping item with bad pubDate %r in %s",
                        pub_date, feed["url"])
        return None

    # Preferred image source: the element named by 'image_xpath' (for example
    # <enclosure>), read from its "url" attribute.
    image = item.find(feed.get("image_xpath", "."),
                      namespaces=feed.get("image_ns", {}))
    image_url = image.get("url") if image is not None else None

    # Fallback: first <img> inside the item's HTML content, when configured.
    if not image_url and feed.get("content_xpath"):
        content_text = item.findtext(
            feed["content_xpath"],
            default="",
            namespaces=feed.get("content_ns", {}),
        )
        # findtext() returns "" for a missing or empty element, so the helper
        # never receives None.
        image_url = extract_image_from_content(content_text)

    return {
        "title": _child_text(item, "title"),
        "link": _child_text(item, "link"),
        "timestamp": timestamp,
        "source": feed["source"],
        "image": image_url,
        "source_url": feed["source_url"],
    }


def fetch_articles():
    """Fetch and parse every RSS feed listed in ``FEEDS``.

    Each feed is requested with a timeout and an explicit User-Agent. A feed
    that cannot be fetched, returns a non-200 status, or is not well-formed
    XML is logged and skipped, so one bad feed does not hide the others.
    Items without a parseable ``pubDate`` are skipped.

    A feed dict may carry an optional ``'headers'`` mapping; its entries
    override the default request headers for that feed.

    Returns:
        A list of dicts with keys ``title``, ``link``, ``timestamp`` (a
        ``datetime``), ``source``, ``image`` (URL or ``None``) and
        ``source_url``, in feed order.
    """
    articles = []
    for feed in FEEDS:
        url = feed["url"]
        headers = {**_FEED_REQUEST_HEADERS, **feed.get("headers", {})}
        try:
            response = requests.get(url, headers=headers,
                                    timeout=_FEED_TIMEOUT_SECONDS)
        except requests.RequestException:
            _logger.exception("Could not fetch feed %s", url)
            continue

        if response.status_code != 200:
            # Log the status so a rejected request (403, 429, ...) is visible
            # instead of the feed silently vanishing.
            _logger.warning("Feed %s returned HTTP %s; skipping",
                            url, response.status_code)
            continue

        try:
            root = ET.fromstring(response.content)
        except ET.ParseError:
            _logger.exception("Feed %s is not well-formed XML", url)
            continue

        parsed = (_parse_feed_item(item, feed)
                  for item in root.findall(".//item"))
        feed_articles = [article for article in parsed if article is not None]

        # Drop the first entry when the feed repeats its newest article.
        if (len(feed_articles) > 1
                and feed_articles[0]["title"] == feed_articles[1]["title"]):
            feed_articles.pop(0)

        articles.extend(feed_articles)
    return articles
我使用以下提要配置(一个由 Python 字典组成的列表)运行它:
# Feed definitions read by fetch_articles(). Keys used per feed:
#   url           - address of the RSS document
#   source        - name stored on each article
#   source_url    - site URL stored on each article
#   image_xpath   - path, relative to <item>, of the element whose "url"
#                   attribute holds the image
#   image_ns      - namespace prefix map used with image_xpath
#   content_xpath - optional path to HTML content, searched for an <img>
#                   when image_xpath yields nothing
#   content_ns    - namespace prefix map used with content_xpath
FEEDS = [
    {
        'url': 'https://hedgehogreview.com/web-features/feed',
        'source': 'Hedgehog Review',
        'source_url': 'https://hedgehogreview.com/',
        'image_xpath': './enclosure',
        'image_ns': {},
        'content_xpath': './content:encoded',
        # The URI must match the feed's xmlns:content declaration exactly;
        # with a single slash after "http:" the lookup never matches.
        'content_ns': {
            'content': 'http://purl.org/rss/1.0/modules/content/',
        },
    },
    {
        'url': 'https://mcrawford.substack.com/feed',
        'source': 'M.B. Crawford Substack',
        'source_url': 'https://mcrawford.substack.com',
        'image_xpath': './enclosure',
        'image_ns': {},
    },
    {
        'url': 'https://mattdinan.substack.com/feed',
        'source': 'Matt Dinan Substack',
        'source_url': 'https://mattdinan.substack.com',
        'image_xpath': './enclosure',
        'image_ns': {},
    },
]
当我在本地运行它时,所有提要都能加载。当我在免费层的 Azure 应用服务中运行它时,只有 Hedgehog Review 能加载。托管在云端时,代码无法从 Substack 获取 RSS 提要,可能是什么原因?
我已检查是否允许入站和出站流量(当然是允许的,因为我可以访问该站点并且某些对象正在加载)。我已验证所有依赖项均位于 dependency.txt 中并且部署成功。