问题描述:
我正在尝试自动化一个流程:访问一个网站,将鼠标悬停在菜单导航栏上,依次单击第 1 层下拉列表中的每个导航类别选项,进入对应页面,抓取该页面上前 20 个产品的产品详细信息,并将其保存到 Excel 文件中。如果该页面不包含任何产品,脚本会继续向下滚动,直到到达页面末尾;如果仍然没有找到产品 div,它将返回到页面顶部,然后单击页面中的下一个类别。(图:导航面板)
函数定义:
scroll_and_click_view_more函数用于向下滚动页面,prod_vitals函数用于抓取每个页面特定的产品详细信息,prod_count函数用于提取每个页面上的产品总数并创建所有页面的摘要。
错误描述:
由于该网站是日语的,因此我想在打开每个页面时将其翻译为英语,然后进行抓取。我编写了一个函数 translate_page 来翻译页面,并在每次从 scrape 函数打开新页面时调用此函数。代码按预期工作正常,唯一的问题是我仍然得到日语而不是英语的所有结果。
下面是我在 Excel 文件中输出的屏幕截图。所有导航选项卡名称和产品名称均为日语。我希望在从网站上抓取之前将它们翻译成英文。
我在下面附加了“WebScraper”类,我在其中编写了翻译函数。为了清楚起见,我现在删除了其余的函数定义。如果需要的话稍后会添加。
class WebScraper:
    """Drive Chrome through the navigation links of ``self.url`` and scrape each one.

    ``scrape`` reads the navigation anchors from the landing page, opens every
    linked category page in its own tab, and hands the resulting markup to the
    module-level helpers ``scroll_and_click_view_more``, ``prod_vitals`` and
    ``prod_count``.
    """

    def __init__(self):
        """Start a Chrome session through the local chromedriver binary."""
        self.url = "https://staging1-japan.coach.com/?auto=true"
        options = Options()
        # Pin Chrome's DevTools remote-debugging port to 9222.
        # A previous attempt passed `--lang=en` here; it is currently disabled.
        options.add_argument("--remote-debugging-port=9222")
        self.driver = webdriver.Chrome(
            service=Service(r"c:\Users\DELL\Documents\Self_Project\chromedriver.exe"),
            options=options,
        )

    def translate_page(self):
        """Apply the language tweaks to the tab the driver is currently focused on.

        Two things happen: a ``<meta name="google" content="notranslate">`` tag
        is appended to the document head, and a script overriding
        ``navigator.languages`` with ``['en-US', 'en']`` is registered through
        the DevTools protocol.

        NOTE(review): neither step translates the already-rendered DOM.  The
        ``notranslate`` meta value is the documented opt-OUT from Google
        translation, and ``Page.addScriptToEvaluateOnNewDocument`` only affects
        documents created after it is registered.  Scraped text therefore stays
        in the page's original language.
        """
        script = """
            var meta = document.createElement('meta');
            meta.name = 'google';
            meta.content = 'notranslate';
            document.getElementsByTagName('head')[0].appendChild(meta);
        """
        self.driver.execute_script(script)
        self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            "source": """
                Object.defineProperty(navigator, 'languages', {
                    get: function() { return ['en-US', 'en']; }
                });
            """
        })

    def scrape(self):
        """Visit every navigation link on the landing page and scrape its products.

        The landing page is re-parsed on each pass so the anchor list reflects
        the current DOM.  The loop ends when every anchor has been visited or
        when the navigation container can no longer be found.  Per-category
        failures are printed and skipped; the main window is closed at the end.
        """
        self.driver.get(self.url)
        time.sleep(5)
        nav_count = 0
        main_window = self.driver.window_handles[0]
        while True:
            # Bound up front so the TimeoutException handler can always print it.
            href = None
            try:
                self.driver.switch_to.window(main_window)
                self.driver.execute_script("window.scrollBy(0, 100);")
                # Refresh the page source and parse it.
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                nav = soup.find('div', {'class': 'css-wnawyw'})
                if nav is None:
                    # Without this guard the AttributeError below would be
                    # swallowed on every pass and the loop would never end.
                    print("An error occurred: navigation container not found")
                    break
                links = nav.find_all('a', {'class': 'css-ipxypz'})
                if nav_count >= len(links):
                    break
                link = links[nav_count]
                href = link.get('href')
                time.sleep(2)
                element = WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, f'a[href="{href}"]'))
                )
                self.driver.execute_script(
                    "window.scrollTo(0, arguments[0].getBoundingClientRect().top + window.scrollY - 100);",
                    element,
                )
                time.sleep(5)
                self._scrape_category(href, link.get('title'))
            except TimeoutException:
                print(f"Element with href {href} not clickable")
                self.driver.save_screenshot('timeout_exception.png')
            except Exception as e:
                # Best-effort crawl: report the failure and move on to the next link.
                print(f"An error occurred: {e}")
            finally:
                nav_count += 1
        self.driver.close()

    def _scrape_category(self, href, title):
        """Open ``href`` in a new tab, scrape it, and always close that tab again."""
        known_handles = set(self.driver.window_handles)
        # Pass href as a script argument so quotes in the URL cannot break the JS.
        self.driver.execute_script("window.open(arguments[0], '_blank');", href)
        time.sleep(3)
        new_tabs = [h for h in self.driver.window_handles if h not in known_handles]
        if not new_tabs:
            # Nothing was opened; do not touch (or close) the main window.
            print(f"An error occurred: no new tab opened for {href}")
            return
        self.driver.switch_to.window(new_tabs[-1])
        try:
            time.sleep(3)
            self.translate_page()  # Apply the language tweaks to the new page
            response = scroll_and_click_view_more(self.driver, href)
            time.sleep(3)
            # The helper signals "nothing to scrape" with these two sentinel
            # strings; any other value is parsed as page markup.
            if response != "No product tiles found" and response != "Reached the end of the page.":
                soup = BeautifulSoup(response, 'html.parser')
                prod_vitals(soup, title, self.url)
                time.sleep(5)
                prod_count(soup, title)
                self.driver.execute_script("window.scrollBy(0, -500);")
                time.sleep(2)
            else:
                self.driver.execute_script("window.scrollTo(0,0);")
                time.sleep(3)
        finally:
            # Close the category tab on every path so tabs do not pile up.
            self.driver.close()
# Run the crawl, then shut the browser down.  The try/finally guarantees that
# the Chrome/chromedriver processes are terminated even when scrape() raises.
scraper = WebScraper()
try:
    scraper.scrape()
    time.sleep(5)
finally:
    scraper.driver.quit()
以下代码修改对我有用。
def __init__(self):
    """Start Chrome with an English locale and Japanese-to-English translation prefs.

    The ``--lang=en`` switch sets the browser locale, and the profile
    preferences enable Chrome's translate feature and list Japanese as a
    language to translate into English.
    """
    self.url = "https://staging1-japan.coach.com/?auto=true"
    options = Options()
    options.add_argument("--remote-debugging-port=9222")
    options.add_argument("--lang=en")
    prefs = {
        # Legacy preference name for the always-translate list.
        "translate_whitelists": {"ja": "en"},
        # NOTE(review): newer Chrome builds appear to read this list under
        # `translate_allowlists`; both are set so either is honoured.
        # Confirm against the Chrome version in use.
        "translate_allowlists": {"ja": "en"},
        # A real boolean, not the string "true".
        "translate": {"enabled": True},
    }
    options.add_experimental_option("prefs", prefs)
    self.driver = webdriver.Chrome(
        service=Service(r"c:\Users\DELL\Documents\Self_Project\chromedriver.exe"),
        options=options,
    )
def translate_page(self):
    """Apply the language tweaks to the tab the driver is currently focused on.

    Appends a ``<meta name="google" content="notranslate">`` tag to the
    document head, then registers a DevTools script that makes
    ``navigator.languages`` report ``['en-US', 'en']`` in documents created
    afterwards.

    NOTE(review): ``notranslate`` is the documented opt-out from Google
    translation, so this method does not itself translate the page.
    Confirm that adding the tag after load is intended.
    """
    script = """
        var meta = document.createElement('meta');
        meta.name = 'google';
        meta.content = 'notranslate';
        document.getElementsByTagName('head')[0].appendChild(meta);
    """
    self.driver.execute_script(script)
    self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        "source": """
            Object.defineProperty(navigator, 'languages', {
                get: function() { return ['en-US', 'en']; }
            });
        """
    })