使用 Puppeteer 获取数据

问题描述 投票:1回答:1

我尝试使用Puppeteer从第一个搜索页面中获取图像的所有链接,但我从40个链接中仅获得6个链接。这是我的代码:

const puppeteer = require('puppeteer');

// Opens the search results page in headless Chrome and prints the `src` of
// every element that carries both the `_1T9dHf` and `_3XaILN` classes at the
// moment the page is queried.
(async () => {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  await page.goto('https://shopee.vn/search?keyword=iphone%20xs', {
    waitUntil: 'networkidle0',
  });

  // Runs inside the page: map each matching element to its `src` value.
  const links = await page.evaluate(() =>
    Array.from(
      document.getElementsByClassName("_1T9dHf _3XaILN"),
      (element) => element.src
    )
  );

  for (const [index, link] of links.entries()) {
    console.log(`Links of ${index} images : `, link);
  }

  await browser.close();
})();

如何解决,才能从第一个搜索页面获得40个链接?谢谢。

node.js web-scraping browser puppeteer headless
1个回答
0
投票

我相信所讨论的站点正在对图像进行一些延迟加载和/或某些按需DOM操作。

因此,我们可以改编下面这个出色的答案,通过缓慢向下滚动页面来触发图片加载:

Puppeteer - scroll down until you can't anymore

此外,我们还会对页面进行截图(在运行脚本的目录中打开 ./page.png 即可查看),这可以帮助您确认哪些内容已经加载(或者没有加载)。代码如下:

const puppeteer = require('puppeteer');

// Scroll downwards slowly
/**
 * Scrolls the page downwards in small steps until the total distance
 * scrolled reaches the document body's scroll height.
 *
 * @param {object} page - Puppeteer page (any object exposing
 *   `evaluate(fn, ...args)`).
 * @param {object} [options] - Optional tuning knobs.
 * @param {number} [options.step=50] - Pixels scrolled on every tick.
 * @param {number} [options.delay=20] - Milliseconds between ticks.
 * @returns {Promise<void>} Resolves once the bottom has been reached.
 */
async function scroll(page, { step = 50, delay = 20 } = {}) {
    await page.evaluate(({ stepPx, delayMs }) => new Promise((resolve) => {
        let heightScrolled = 0;

        const timer = setInterval(() => {
            window.scrollBy(0, stepPx);
            heightScrolled += stepPx;

            // scrollHeight is re-read on every tick, so any growth of the
            // page while scrolling extends the target.
            if (heightScrolled >= document.body.scrollHeight) {
                // Stop the timer; otherwise the page keeps scrolling forever
                // after this promise has already resolved.
                clearInterval(timer);
                resolve();
            }
        }, delayMs);
    }), { stepPx: step, delayMs: delay });
}

/**
 * Loads `url` in a fresh browser, scrolls to the bottom of the page, saves a
 * full-page screenshot to ./page.png and logs the `src` of every element that
 * carries both the `_1T9dHf` and `_3XaILN` classes.
 *
 * @param {string} url - Address of the page to open.
 * @returns {Promise<string[]>} The collected `src` values, in document order.
 */
async function getImages(url) {
    const browser = await puppeteer.launch();

    try {
        const page = await browser.newPage();

        // Set the viewport before navigating so the page is laid out at its
        // final size from the start.
        await page.setViewport({
            width: 1200,
            height: 800
        });

        await page.goto(url, {waitUntil: 'networkidle0'});

        await scroll(page);

        // Snapshot of what was actually rendered, useful for debugging.
        await page.screenshot({
            fullPage: true,
            path: `./page.png`
        });

        // Runs inside the page: map each matching element to its `src`.
        const links = await page.evaluate(() => Array.from(
            document.getElementsByClassName("_1T9dHf _3XaILN"),
            (element) => element.src
        ));

        for (const [index, link] of links.entries()) {
            console.log(`Links of ${index} images : `, link);
        }

        return links;
    } finally {
        // Always release the browser process, even when a step above throws.
        await browser.close();
    }
}

const url = 'https://shopee.vn/search?keyword=iphone%20xs';

// Report failures instead of leaving the promise rejection unhandled.
getImages(url).catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
© www.soinside.com 2019 - 2024. All rights reserved.