Puppeteer:向下滚动Twitter时间线停止

问题描述 投票:1回答:1

我在使用puppeteer在用户时间线上抓取所有推文网址时遇到问题。

脚本使用puppeteer,在scrollToEnd函数的while循环的每次迭代中向下滚动时间线,直到到达底部。为了监视进度,我让脚本输出previousHeight变量的值,即每次执行滚动之前求得的document.body的当前scrollHeight。

但是,一旦输出值变为285,834,滚动就会停止。令人费解的是,脚本既没有跳出while循环,page.waitForFunction方法也没有抛出超时错误。

我应该如何重写scrollToEnd函数或脚本的任何其他部分,以便函数正确结束?

这是我的代码片段。为简洁起见,省略了不相关的功能。

const puppeteer = require('puppeteer');

// Profile timelines to scrape; the main loop visits each URL in order.
const UserUrls = [
    'https://twitter.com/someuser',
];

// more functions here

/**
 * Scrolls the page down repeatedly until `document.body.scrollHeight`
 * stops growing.
 *
 * The previous version used `page.waitForFunction` as the loop condition.
 * That call resolves only once its predicate becomes truthy and otherwise
 * rejects on timeout, so it can never yield a falsy value; the loop could
 * only end through an exception. Here the heights are read with
 * `page.evaluate` and compared directly, so the loop exits normally as soon
 * as a scroll no longer increases the height.
 *
 * @param {object} page - Puppeteer page; only `page.evaluate` is used.
 * @param {number} [ScrollDelay=1000] - Milliseconds to wait after each
 *   scroll before re-reading the height.
 * @returns {Promise<void>} Resolves when scrolling has finished. Errors are
 *   logged rather than rethrown, so the caller can still proceed.
 */
async function scrollToEnd(
    page,
    ScrollDelay = 1000
) {
    try {
        let previousHeight = 0;
        let currentHeight = await page.evaluate('document.body.scrollHeight');
        while (currentHeight > previousHeight) {
            previousHeight = currentHeight;
            await page.evaluate('window.scrollBy(0, document.body.scrollHeight)');
            // Plain timer instead of the deprecated page.waitFor(ms).
            await new Promise((resolve) => { setTimeout(resolve, ScrollDelay); });

            currentHeight = await page.evaluate('document.body.scrollHeight');
            // Progress output: the height observed before this scroll.
            console.log(previousHeight);
        }
    } catch (e) {
        // Best-effort: report the failure instead of hiding it.
        console.error(e);
    }
}

// Entry point: visits every URL in UserUrls, scrolls its timeline to the
// bottom, takes a screenshot and collects tweet URLs.
(async () => {
    const browser = await puppeteer.launch();
    let tweetUrls = [];
    try {
        const page = await browser.newPage();
        for (const UserUrl of UserUrls) {
            await page.goto(UserUrl);
            // scrollToEnd drives the page from Node, so it is awaited directly.
            // Wrapping it in page.evaluate((async () => {...})()) invoked the
            // function immediately and handed page.evaluate a Promise object
            // instead of a function or expression to run in the browser.
            await scrollToEnd(page);
            await page.screenshot({ path: 'PageEnd.png' });
            // NOTE(review): each iteration overwrites tweetUrls (and the
            // screenshot file); confirm whether results for several users
            // should be accumulated instead.
            tweetUrls = await getTweetUrls(page, extractItems, 100);
        }
    } finally {
        // Close the browser even when navigation or scraping throws.
        await browser.close();
    }
    console.log(tweetUrls);
})().catch((err) => {
    // Avoid an unhandled rejection; report the failure and signal it via exit code.
    console.error(err);
    process.exitCode = 1;
});
javascript node.js twitter web-scraping puppeteer
1个回答
0
投票

你能试试这两种方法中的一种吗?此脚本尝试通过两种方式滚动到底部:比较滚动高度(如您所做),或者等待标记信息流结束的元素变为可见。所有滚动逻辑都放在浏览器上下文中执行的函数内。这两个函数都返回整个页面中的推文数量,以便与时间线顶部显示的用户推文总数进行比较。此外,我把第一种方法的延迟改为3秒,因为有时1秒不足以让滚动高度发生变化。

'use strict';

const puppeteer = require('puppeteer');

// Demo driver: loads the same timeline twice and runs each in-page scrolling
// strategy once, printing the tweet count each one returns.
(async function main() {
  const timelineUrl = 'https://twitter.com/GHchangelog';

  try {
    const browser = await puppeteer.launch({ headless: false });
    const openPages = await browser.pages();
    const page = openPages[0];

    // Strategy 1: scroll until the scroll height stops increasing.
    await page.goto(timelineUrl);
    const data1 = await page.evaluate(scrollToBottomByMaxHeight);
    console.log(`Tweets: ${data1}`);

    // Strategy 2: scroll until the end-of-stream element has a height.
    await page.goto(timelineUrl);
    const data2 = await page.evaluate(scrollToBottomByEndElement);
    console.log(`Tweets: ${data2}`);

    // NOTE: browser.close() is not called, so the browser is left running.
  } catch (err) {
    console.error(err);
  }
})();

/**
 * Runs in the browser context (passed to page.evaluate). Keeps scrolling by
 * the current scroll height, pausing 3 seconds after each scroll, until the
 * scroll height is no longer greater than the height seen before the scroll.
 *
 * Everything it needs is defined inside the function because it refers only
 * to the page's own `document` and `window`.
 *
 * @returns {Promise<number|Error>} The number of `a.js-permalink` elements in
 *   the page, or the caught error object if anything throws.
 */
async function scrollToBottomByMaxHeight() {
  const readHeight = () => document.scrollingElement.scrollHeight;
  const pause = (ms) => new Promise((resolve) => { setTimeout(resolve, ms); });

  try {
    let heightBeforeScroll = 0;

    for (
      let heightNow = readHeight();
      heightBeforeScroll < heightNow;
      heightNow = readHeight()
    ) {
      heightBeforeScroll = readHeight();
      window.scrollBy(0, heightBeforeScroll);
      await pause(3000);
    }

    const permalinks = document.querySelectorAll('a.js-permalink');
    return permalinks.length;
  } catch (err) {
    return err;
  }
}

/**
 * Runs in the browser context (passed to page.evaluate). Looks up
 * `div.stream-end` once, then scrolls by the full scroll height and pauses
 * 1 second, repeating for as long as that element's clientHeight is 0.
 *
 * @returns {Promise<number|Error>} The number of `a.js-permalink` elements in
 *   the page, or the caught error object if anything throws (for example when
 *   `div.stream-end` is not found).
 */
async function scrollToBottomByEndElement() {
  const pause = (ms) => new Promise((resolve) => { setTimeout(resolve, ms); });

  try {
    const streamEnd = document.querySelector('div.stream-end');

    for (;;) {
      // A non-zero height means the end-of-stream marker is rendered.
      if (streamEnd.clientHeight !== 0) {
        const permalinks = document.querySelectorAll('a.js-permalink');
        return permalinks.length;
      }

      window.scrollBy(0, document.scrollingElement.scrollHeight);
      await pause(1000);
    }
  } catch (err) {
    return err;
  }
}
© www.soinside.com 2019 - 2024. All rights reserved.