我正在尝试使用 Node.js 和 Cheerio 抓取 Reddit 帖子和评论,但没有得到预期的结果。以下是我的设置和我面临的问题的简要概述。
设置:
Node.js:v14.17.0
Cheerio:v1.0.0-rc.10
request-promise:v4.2.6
代码:
const rp = require('request-promise');
const cheerio = require('cheerio');
// Subreddit listing page that the script downloads.
const url = 'https://www.reddit.com/r/javascript/';

/**
 * Collects the title and comments link of every `div.Post` element found in
 * the given HTML document.
 *
 * @param {string} html - Raw HTML returned for the subreddit page.
 * @returns {Array<{title: string, commentsLink: (string|undefined)}>} One
 *   entry per matched post; empty when the markup contains no `div.Post`.
 */
const extractPosts = (html) => {
  const $ = cheerio.load(html);
  return $('div.Post')
    .map((index, node) => {
      const post = $(node);
      return {
        title: post.find('h3').text(),
        commentsLink: post.find('a[data-click-id="comments"]').attr('href'),
      };
    })
    .get();
};

// Download the page, print the extracted posts, and report any failure
// (from the request or from the extraction) on stderr.
rp(url)
  .then((html) => console.log(extractPosts(html)))
  .catch((err) => console.error(err));
如果您把代码实际从网站获取到的内容打印出来查看,就会发现 Reddit 拦截了您的请求。
您需要执行(渲染)页面上的动态内容,并且至少要让请求看起来像是来自真实的浏览器。您可以使用 Puppeteer 来做到这一点。
const puppeteer = require('puppeteer');
/**
 * Opens r/javascript in a Puppeteer-controlled browser, reads the link text
 * of the first anchor inside every <article> element, and prints the result.
 *
 * The browser is closed even when navigation or extraction fails, and any
 * failure is logged and reflected in a non-zero exit code.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Send a desktop Chrome user agent so the request looks like a regular browser.
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36');
    await page.goto('https://www.reddit.com/r/javascript/');
    // Wait until the page heading exists before querying the DOM.
    await page.waitForSelector('h1');
    const data = await page.evaluate(() => {
      // Get all articles on page.
      //
      // ! Without pagination there will just be the top three articles.
      //
      const articles = Array.from(document.querySelectorAll('article'));
      // Extract content from each article.
      //
      // ! You can get some text but it's just a summary of the article, not full content.
      //
      return articles.map((article) => {
        // An article without any anchor yields a null title instead of throwing.
        const link = article.querySelector('a');
        return {
          title: link ? link.innerText : null,
        };
      });
    });
    console.log(data);
  } finally {
    // Always release the browser process, even if a step above threw.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
输出示例:
[
{ title: 'WTF Wednesday (June 19, 2024)' },
{
title: 'Your /r/javascript recap for the week of June 10 - June 16, 2024'
},
{ title: 'Announcing TypeScript 5.5' }
]