Puppeteer: Page.navigate limit reached

Question (0 votes, 1 answer)

My JSON file with the list of doctor names and profile pages:

[
  {
    "name": "Herr Prof. Dr. med. Armin Quentmeyer",
    "link": "https://www.doctolib.de/orthopadie/ludwigshafen-am-rhein/dr-prof-armin-quentmeyer?pid=practice-238444"
  },
  {
    "name": "Herr Dr. med. Ralph Hower",
    "link": "https://www.doctolib.de/orthopadie/murnau-am-staffelsee/ralph-hower?pid=practice-228573"
  },
  {
    "name": "Orthopädische Praxis Ludwigsburg",
    "link": "https://www.doctolib.de/praxis/ludwigsburg/orthopaedische-praxis-ludwigsburg?pid=practice-570039"
  }
]
// website_adder.js
import puppeteer from "puppeteer-core";
import fs from 'fs';

async function addWebsites() {
    let browser;
    try {
        console.log("Starting function");
        const auth = '***';
        
        browser = await puppeteer.connect({
            browserWSEndpoint: `wss://${auth}@***`
        });

        // Read the JSON file with the list of doctors
        const data = fs.readFileSync('uniqueDoctorsData.json', 'utf-8');
        const doctorsData = JSON.parse(data);

        // Iterate through each doctor, visit their profile, and add the website to their data
        for (let doctor of doctorsData) {
            console.log(`Visiting profile of ${doctor.name} at ${doctor.link}`);

            try {
                // Introduce a short delay between navigations to avoid hitting the page limit
                await new Promise(resolve => setTimeout(resolve, 2000)); // Wait for 2 seconds
                console.log("Waited for 2000ms");

                // Use a new page for each doctor's profile
                const profilePage = await browser.newPage();
                await profilePage.goto(doctor.link, { waitUntil: 'networkidle2' });

                // Extract the doctor's website from their profile page
                const website = await profilePage.evaluate(() => {
                    const websiteElement = document.querySelector('a[rel="nofollow"][target="_blank"]');
                    return websiteElement ? websiteElement.href : null;
                });

                // Add the website to the doctor's data
                doctor.website = website;

                console.log(`Extracted website for ${doctor.name}: ${website}`);

                // Close the profile page to free up resources
                await profilePage.close();

                // Save the updated doctors data back to the JSON file immediately after processing each doctor
                fs.writeFileSync('uniqueDoctorsData.json', JSON.stringify(doctorsData, null, 2), 'utf-8');
                console.log(`Saved updated data for ${doctor.name} to JSON file`);

            } catch (error) {
                console.error(`Failed to navigate to ${doctor.link}`, error);
            }
        }

        console.log('All doctor profiles processed and data saved to uniqueDoctorsData.json');

    } catch (e) {
        console.error('Failed to add websites', e);
    } finally {
        if (browser) {
            await browser.close();
        }
    }
}

// Export the function to be used in index.js
export { addWebsites };

I have a simple function that extracts each doctor's website from their profile page, based on a JSON file in which I have collected the doctors' names and profile URLs. But no matter what I try, I always get the

Page.navigate limit reached

error back. I couldn't find anything about this error on Google or anywhere else here. Help would be greatly appreciated!!

I have tried calling the pages one by one without a loop and with a recursive loop. When I visit one page before the loop and then another inside the loop, I can visit 2 pages in a row...

Edit, minimal reproducible example:

// website_adder.js
import puppeteer from "puppeteer";
import fs from 'fs';

async function addWebsites() {
    let browser;
    try {
        console.log("Starting function");

        // Launch a new browser instance (no proxy connection)
        browser = await puppeteer.launch({
            headless: true, // Run in headless mode for efficiency
            args: ['--no-sandbox', '--disable-setuid-sandbox'] // Additional arguments to improve performance/stability
        });

        // Read the JSON file with the list of doctors
        const data = fs.readFileSync('uniqueDoctorsData.json', 'utf-8');
        const doctorsData = JSON.parse(data);

        // Iterate through each doctor, visit their profile, and add the website to their data
        for (let doctor of doctorsData) {
            // Skip doctors who already have a website to avoid unnecessary work
            if (doctor.website) {
                console.log(`Skipping ${doctor.name} as website is already added.`);
                continue;
            }

            console.log(`Visiting profile of ${doctor.name} at ${doctor.link}`);

            try {
                // Introduce a short delay between navigations to avoid hitting the page limit
                await new Promise(resolve => setTimeout(resolve, 2000)); // Wait for 2 seconds
                console.log("Waited for 2000ms");

                // Use a new page for each doctor's profile
                const profilePage = await browser.newPage();
                await profilePage.goto(doctor.link, { waitUntil: 'networkidle2' });

                // Extract the doctor's website from their profile page
                const website = await profilePage.evaluate(() => {
                    const websiteElement = document.querySelector('a[rel="nofollow"][target="_blank"]');
                    return websiteElement ? websiteElement.href : null;
                });

                // Add the website to the doctor's data
                doctor.website = website;

                console.log(`Extracted website for ${doctor.name}: ${website}`);

                // Close the profile page to free up resources
                await profilePage.close();

                // Save the updated doctors data back to the JSON file immediately after processing each doctor
                fs.writeFileSync('uniqueDoctorsData.json', JSON.stringify(doctorsData, null, 2), 'utf-8');
                console.log(`Saved updated data for ${doctor.name} to JSON file`);

            } catch (error) {
                console.error(`Failed to navigate to ${doctor.link}`, error);
            }
        }

        console.log('All doctor profiles processed and data saved to uniqueDoctorsData.json');

    } catch (e) {
        console.error('Failed to add websites', e);
    } finally {
        if (browser) {
            await browser.close();
        }

        // After processing the current batch, check again if there are any entries left without websites
        const updatedData = fs.readFileSync('uniqueDoctorsData.json', 'utf-8');
        const updatedDoctorsData = JSON.parse(updatedData);
        const remainingDoctors = updatedDoctorsData.filter(doctor => !doctor.website);

        if (remainingDoctors.length > 0) {
            console.log("Continuing to process remaining doctors without websites...");
            // Call the function again to process remaining entries
            await addWebsites();
        } else {
            console.log("All doctors have websites. Process completed.");
        }
    }
}

// Export the function to be used in index.js
export { addWebsites };

What is strange: this version, without the Bright Data connection, works perfectly fine... I checked the logs in Bright Data and everything looks fine there.

javascript puppeteer
1 Answer

The Bright Data docs state that if you navigate programmatically, rather than by clicking inside the website, only one navigation is allowed per session. So if you want to scrape multiple sites by going directly to URLs instead of clicking within a page, you need to recreate the session; one way to do this is to close the browser and connect again. For example, in your code you could close the browser and reconnect it before navigating to each doctor:

 ...
 // Use a new page for each doctor's profile
 await browser.close();
 browser = await puppeteer.connect(browserConfig);
 console.log('Connected to browser');
 const profilePage = await browser.newPage();
 await profilePage.goto
 ...
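
For completeness, here is a minimal, self-contained sketch of how the question's loop could be restructured around this idea, reconnecting for every doctor so that each programmatic navigation gets its own session. The endpoint placeholders (YOUR_AUTH, YOUR_BRIGHTDATA_HOST) stand in for the Bright Data Scraping Browser credentials from the question and are not real values:

// reconnect_per_doctor.js -- sketch only
import puppeteer from "puppeteer-core";
import fs from 'fs';

// Reused for every reconnect; replace the placeholders with your own
// Scraping Browser credentials and host.
const browserConfig = {
    browserWSEndpoint: `wss://YOUR_AUTH@YOUR_BRIGHTDATA_HOST`
};

async function addWebsites() {
    const doctorsData = JSON.parse(fs.readFileSync('uniqueDoctorsData.json', 'utf-8'));

    for (const doctor of doctorsData) {
        if (doctor.website) continue; // already processed on a previous run

        let browser;
        try {
            // Fresh session per doctor: one programmatic navigation per session.
            browser = await puppeteer.connect(browserConfig);
            const page = await browser.newPage();
            await page.goto(doctor.link, { waitUntil: 'networkidle2' });

            doctor.website = await page.evaluate(() => {
                const el = document.querySelector('a[rel="nofollow"][target="_blank"]');
                return el ? el.href : null;
            });

            // Persist progress after every doctor so a crash loses at most one entry.
            fs.writeFileSync('uniqueDoctorsData.json', JSON.stringify(doctorsData, null, 2), 'utf-8');
            console.log(`Saved website for ${doctor.name}: ${doctor.website}`);
        } catch (error) {
            console.error(`Failed to process ${doctor.link}`, error);
        } finally {
            // Closing the browser ends the session; the next connect starts a new one.
            if (browser) await browser.close();
        }
    }
}

export { addWebsites };

Note that reconnecting for every navigation is noticeably slower than reusing one session, so it is only worth doing when the endpoint actually enforces the one-navigation-per-session rule described above.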