我的 JSON 文件:医生姓名与个人资料页面链接的列表
[
{
"name": "Herr Prof. Dr. med. Armin Quentmeyer",
"link": "https://www.doctolib.de/orthopadie/ludwigshafen-am-rhein/dr-prof-armin-quentmeyer?pid=practice-238444"
},
{
"name": "Herr Dr. med. Ralph Hower",
"link": "https://www.doctolib.de/orthopadie/murnau-am-staffelsee/ralph-hower?pid=practice-228573"
},
{
"name": "Orthopädische Praxis Ludwigsburg",
"link": "https://www.doctolib.de/praxis/ludwigsburg/orthopaedische-praxis-ludwigsburg?pid=practice-570039"
}
]
// website_adder.js
import puppeteer from "puppeteer-core";
import fs from 'fs';
/**
 * Visits each doctor's profile page (from uniqueDoctorsData.json), extracts the
 * external website link, stores it on the doctor record, and persists the file
 * after every doctor so progress survives a crash.
 * Connects to a remote browser over a WebSocket endpoint (credentials redacted).
 * @returns {Promise<void>}
 */
async function addWebsites() {
  let browser;
  try {
    console.log("Starting function");
    const auth = '***';
    browser = await puppeteer.connect({
      browserWSEndpoint: `wss://${auth}@***`
    });
    // Read the JSON file with the list of doctors
    const data = fs.readFileSync('uniqueDoctorsData.json', 'utf-8');
    const doctorsData = JSON.parse(data);
    // Iterate through each doctor, visit their profile, and add the website to their data
    for (const doctor of doctorsData) {
      console.log(`Visiting profile of ${doctor.name} at ${doctor.link}`);
      let profilePage;
      try {
        // Short delay between navigations to avoid hitting the page limit
        // (fixed comment: the delay is 2 seconds, not 10)
        await new Promise(resolve => setTimeout(resolve, 2000)); // Wait for 2 seconds
        console.log("Waited for 2000ms");
        // Use a new page for each doctor's profile
        profilePage = await browser.newPage();
        await profilePage.goto(doctor.link, { waitUntil: 'networkidle2' });
        // Extract the doctor's website from their profile page.
        // NOTE(review): assumes the first nofollow/_blank anchor is the practice
        // website — confirm against the profile page markup.
        const website = await profilePage.evaluate(() => {
          const websiteElement = document.querySelector('a[rel="nofollow"][target="_blank"]');
          return websiteElement ? websiteElement.href : null;
        });
        // Add the website to the doctor's data
        doctor.website = website;
        console.log(`Extracted website for ${doctor.name}: ${website}`);
        // Save immediately after each doctor so at most one entry is lost on failure
        fs.writeFileSync('uniqueDoctorsData.json', JSON.stringify(doctorsData, null, 2), 'utf-8');
        console.log(`Saved updated data for ${doctor.name} to JSON file`);
      } catch (error) {
        console.error(`Failed to navigate to ${doctor.link}`, error);
      } finally {
        // Always release the tab — previously it leaked whenever goto/evaluate threw
        if (profilePage) {
          await profilePage.close().catch(() => {});
        }
      }
    }
    // Fixed: log previously named updatedDoctorsData.json, but the data is
    // actually written to uniqueDoctorsData.json
    console.log('All doctor profiles processed and data saved to uniqueDoctorsData.json');
  } catch (e) {
    console.error('Failed to add websites', e);
  } finally {
    // Guarantee the remote browser connection is released
    if (browser) {
      await browser.close();
    }
  }
}
// Export the function to be used in index.js
export { addWebsites };
我有一个简单的函数,它根据一个 JSON 文件(其中收集了医生的姓名和个人资料页面链接)从给定的网站中提取医生的官网地址。但无论我尝试什么,总是收到如下错误:
Page.navigate limit reached
在谷歌以及本站的其他地方我都没有找到关于这个错误的资料。非常感谢任何帮助!
我尝试过不用循环、以及用递归循环逐页访问页面。当我先在循环之前访问一个页面、再在循环内部访问时,可以连续访问 2 个页面……
编辑最小可重现示例:
// website_adder.js
import puppeteer from "puppeteer";
import fs from 'fs';
/**
 * Visits each doctor's profile page (from uniqueDoctorsData.json), extracts the
 * external website link, and persists the updated file after every doctor.
 * Re-runs itself for doctors still missing a website, up to a bounded number of
 * attempts.
 * @param {number} [attempt=0] - Internal retry counter (callers omit it);
 *   added to bound the recursion, which previously never terminated when a
 *   profile had no website link (doctor.website stayed null forever).
 * @returns {Promise<void>}
 */
async function addWebsites(attempt = 0) {
  // Cap on recursive re-runs; prevents the previous infinite recursion.
  const MAX_ATTEMPTS = 3;
  let browser;
  try {
    console.log("Starting function");
    // Launch a new browser instance (no proxy connection)
    browser = await puppeteer.launch({
      headless: true, // Run in headless mode for efficiency
      args: ['--no-sandbox', '--disable-setuid-sandbox'] // Additional arguments to improve performance/stability
    });
    // Read the JSON file with the list of doctors
    const data = fs.readFileSync('uniqueDoctorsData.json', 'utf-8');
    const doctorsData = JSON.parse(data);
    // Iterate through each doctor, visit their profile, and add the website to their data
    for (const doctor of doctorsData) {
      // Skip doctors who already have a website to avoid unnecessary work
      if (doctor.website) {
        console.log(`Skipping ${doctor.name} as website is already added.`);
        continue;
      }
      console.log(`Visiting profile of ${doctor.name} at ${doctor.link}`);
      let profilePage;
      try {
        // Short delay between navigations to avoid hitting the page limit
        await new Promise(resolve => setTimeout(resolve, 2000)); // Wait for 2 seconds
        console.log("Waited for 2000ms");
        // Use a new page for each doctor's profile
        profilePage = await browser.newPage();
        await profilePage.goto(doctor.link, { waitUntil: 'networkidle2' });
        // Extract the doctor's website from their profile page.
        // NOTE(review): assumes the first nofollow/_blank anchor is the practice
        // website — confirm against the profile page markup.
        const website = await profilePage.evaluate(() => {
          const websiteElement = document.querySelector('a[rel="nofollow"][target="_blank"]');
          return websiteElement ? websiteElement.href : null;
        });
        // Add the website to the doctor's data
        doctor.website = website;
        console.log(`Extracted website for ${doctor.name}: ${website}`);
        // Save immediately after each doctor so at most one entry is lost on failure
        fs.writeFileSync('uniqueDoctorsData.json', JSON.stringify(doctorsData, null, 2), 'utf-8');
        console.log(`Saved updated data for ${doctor.name} to JSON file`);
      } catch (error) {
        console.error(`Failed to navigate to ${doctor.link}`, error);
      } finally {
        // Always release the tab — previously it leaked whenever goto/evaluate threw
        if (profilePage) {
          await profilePage.close().catch(() => {});
        }
      }
    }
    console.log('All doctor profiles processed and data saved to uniqueDoctorsData.json');
  } catch (e) {
    console.error('Failed to add websites', e);
  } finally {
    if (browser) {
      await browser.close();
    }
    // After processing the current batch, check again if there are any entries left without websites
    const updatedData = fs.readFileSync('uniqueDoctorsData.json', 'utf-8');
    const updatedDoctorsData = JSON.parse(updatedData);
    const remainingDoctors = updatedDoctorsData.filter(doctor => !doctor.website);
    if (remainingDoctors.length > 0 && attempt + 1 < MAX_ATTEMPTS) {
      console.log("Continuing to process remaining doctors without websites...");
      // Re-run with an incremented attempt counter so the recursion is bounded
      await addWebsites(attempt + 1);
    } else if (remainingDoctors.length > 0) {
      // Previously this case looped forever; now we give up with a clear message
      console.log(`Giving up after ${MAX_ATTEMPTS} attempts; ${remainingDoctors.length} doctors still have no website.`);
    } else {
      console.log("All doctors have websites. Process completed.");
    }
  }
}
// Export the function to be used in index.js
export { addWebsites };
很奇怪的是:这个不使用 Bright Data 连接的版本运行得很好……我检查了 Bright Data 中的日志,也没有发现异常。
Bright Data Docs 上指出,如果您以编程方式导航而不是在网站中单击,则每个会话仅允许一次导航。 因此,如果您想通过使用 URL 而不是通过在网站中单击来抓取多个网站,则需要重新创建会话,一种方法是关闭浏览器并再次连接。例如,在您的代码中,您可以关闭浏览器并在导航 doctor 之前再次连接它
...
// Use a new page for each doctor's profile
await browser.close();
browser = await puppeteer.connect(browserConfig);
console.log('Connected to browser');
const profilePage = await browser.newPage();
await profilePage.goto
...