我正在尝试使用 Selenium 进行网页抓取,想从 Google 下载图像,但遇到了多个问题,其中之一是下面这个错误:
AttributeError: 'list' object has no attribute 'timeout'
这是我的代码:
from urllib.parse import urlparse
from selenium import webdriver
import time as t
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time as t
import urllib
import base64
# Top-level setup: create the project folder, start Chrome, open the Google
# Images results page and initialise the loop state.
# NOTE(review): the paste lost its indentation; the nesting shown here is
# reconstructed.
# NOTE(review): no `import os` is visible in this listing, so this call
# presumably raises NameError, which the bare `except` silently hides — confirm
# against the other notebook cells.
try:
    os.mkdir("G:/Smokking_Project")
except:
    pass
# Sub-folder name; the download loop joins it into the target paths.
name="smoked"
chrome_options = webdriver.ChromeOptions()
# Ask Chrome to start without its 'enable-automation' switch.
chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
#driver = webdriver.Chrome(executable_path='chromedriver.exe',options=chrome_options)
driver = webdriver.Chrome(options=chrome_options)
# Explicit wait with a 5-second limit, used when looking up the <img> elements.
wait = WebDriverWait(driver, 5)
strr="https://www.google.com/search?q=smokinng&tbm=isch&ved=2ahUKEwi8k9zn9eOBAxVtlycCHTa_DnUQ2-cCegQIABAA&oq=smokinng&gs_lcp=CgNpbWcQAzIJCAAQGBCABBAKMgkIABAYEIAEEAoyCQgAEBgQgAQQCjoECCMQJzoFCAAQgAQ6BggAEAUQHjoECAAQHjoICAAQgAQQsQM6BAgAEAM6BwgAEBgQgARQjwdY8xJg-RloAHAAeACAAb0BiAHsCZIBAzAuOZgBAKABAaoBC2d3cy13aXotaW1nwAEB&sclient=img&ei=uUwhZfzSFO2unsEPtv66qAc&bih=723&biw=1517&hl=en"
driver.get(strr)
# Fixed pause after navigation before the page is inspected.
t.sleep(3)
# `src` values already handled, so the loop can skip repeats.
links=[]
# Running counter that becomes the image file name (1.jpg, 2.jpg, ...).
x=1
# Page height seen on the previous pass; the loop stops when it no longer grows.
last_height=0
def download_image(url,filename):
    """Fetch `url` with urllib and write the response body to `filename`.

    Args:
        url: Handed unchanged to `urllib.request.urlopen`.
        filename: Path opened in binary write mode; receives the whole body.
    """
    resource = urllib.request.urlopen(url)
    output = open(filename,"wb")
    output.write(resource.read())
    # Reached only when the write succeeds; no try/finally guards the handle.
    output.close()
# Scroll to the bottom of the page, collect the <img> elements, download the
# ones not seen before, and stop once the page height stops changing.
# NOTE(review): the paste lost its indentation; the nesting below is a
# reconstruction and should be checked against the original notebook.
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Fixed pause after scrolling before the page is queried again.
    t.sleep(4)
    #try:
    # Result of the wait for every element matching the XPath (not one element).
    img_link = wait.until(EC.presence_of_all_elements_located((By.XPATH,'//a[1]/div[1]/img')))
    t.sleep(1)
    for img in img_link:
        url = img.get_attribute('src')
        # Skip any `src` already recorded in `links`.
        if url not in links:
            links.append(url)
            print (url)
            # The same folder creation is attempted twice; every failure is ignored.
            try:
                os.mkdir('G://Smokking_Project//'+name)
            except:
                pass
            try:
                os.mkdir('G://Smokking_Project//'+name)
            except:
                pass
            # Relative path: unlike the mkdir calls above it has no 'G://' prefix.
            file_name='Smokking_Project//'+name+'//'+str(x)+'.jpg'
            # Passes `img_link` (the whole result of the wait), not the single
            # `url` string; this is the call reported in the AttributeError
            # traceback.
            download_image(img_link,file_name)
            x+=1
    #except:
        #print('-',end='')
    new_height = driver.execute_script("return document.body.scrollHeight")
    print(new_height)
    # Unchanged height after a scroll means nothing new was loaded: stop.
    if new_height == last_height:
        break
    last_height = new_height
driver.close()
以下是完整的错误信息:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
c:\Users\Geka\Desktop\openCV\vision_ahmed_ibrahim\webscrapping\webscrapping_Google.ipynb Cell 6 line 6
62 pass
63 file_name='Smokking_Project//'+name+'//'+str(x)+'.jpg'
---> 64 download_image(img_link,file_name)
66 x+=1
67 #except:
68 #print('-',end='')
c:\Users\Geka\Desktop\openCV\vision_ahmed_ibrahim\webscrapping\webscrapping_Google.ipynb Cell 6 line 3
33 def download_image(url,filename):
---> 34 resource = urllib.request.urlopen(url)
35 output = open(filename,"wb")
36 output.write(resource.read())
File c:\Users\Geka\anaconda3\Lib\urllib\request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
214 else:
215 opener = _opener
--> 216 return opener.open(url, data, timeout)
File c:\Users\Geka\anaconda3\Lib\urllib\request.py:509, in OpenerDirector.open(self, fullurl, data, timeout)
506 if data is not None:
507 req.data = data
--> 509 req.timeout = timeout
510 protocol = req.type
512 # pre-process request
AttributeError: 'list' object has no attribute 'timeout'
我们在您的代码中发现了以下问题:
1)AttributeError: 'list' object has no attribute 'timeout': 发生此错误是因为您将元素列表 (img_link) 传递给 download_image 函数而不是单个 URL 字符串。您需要将 url 变量传递给函数。
2)处理Base64图像:要处理Base64图像,您需要解码Base64字符串并将其保存为图像文件。
3)下载包含图片URL的URL:您可以通过向URL发送HTTP请求并保存响应内容来下载URL中的图片。
让我们修改您的代码来解决这些问题:
import base64
import os
import time
import urllib.request

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# ---- Run configuration ----------------------------------------------------
# Sub-folder label that the download loop puts into every target path.
name = "smoked"
# Counter used as the image file name, and the page height from the last pass.
x = 1
last_height = 0
# Google Images results page to scrape.
strr = "https://www.google.com/search?q=smokinng&tbm=isch&ved=2ahUKEwi8k9zn9eOBAxVtlycCHTa_DnUQ2-cCegQIABAA&oq=smokinng&gs_lcp=CgNpbWcQAzIJCAAQGBCABBAKMgkIABAYEIAEEAoyCQgAEBgQgAQQCjoECCMQJzoFCAAQgAQ6BggAEAUQHjoECAAQHjoICAAQgAQQsQM6BAgAEAM6BwgAEBgQgARQjwdY8xJg-RloAHAAeACAAb0BiAHsCZIBAzAuOZgBAKABAaoBC2d3cy13aXotaW1nwAEB&sclient=img&ei=uUwhZfzSFO2unsEPtv66qAc&bih=723&biw=1517&hl=en"

# ---- Output folder ----------------------------------------------------------
# A folder left over from an earlier run is fine; any other error propagates.
try:
    os.mkdir("G:/Smokking_Project")
except FileExistsError:
    pass

# ---- Browser session --------------------------------------------------------
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
driver = webdriver.Chrome(options=chrome_options)
# Explicit wait (5 s limit) used by the loop when locating the <img> elements.
wait = WebDriverWait(driver, 5)

# Open the results page, then pause before the first scroll.
driver.get(strr)
time.sleep(3)
def download_image(url, filename, timeout=30):
    """Download the resource at ``url`` and save it as ``filename``.

    Args:
        url: Address of the image; anything ``urllib.request.urlopen`` accepts.
        filename: Destination path. An existing file is overwritten.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the scraping loop indefinitely.

    Raises:
        urllib.error.URLError: If the request itself fails.
        OSError: If the connection times out or ``filename`` cannot be written.
        ValueError: If ``url`` is not a usable URL.
    """
    # Read the whole body before touching the disk, so a failed download does
    # not leave an empty file behind. Both handles are context-managed and are
    # released even when the read or the write raises.
    with urllib.request.urlopen(url, timeout=timeout) as resource:
        data = resource.read()
    with open(filename, "wb") as output:
        output.write(data)
# Scroll the results page to the bottom repeatedly, saving every new thumbnail,
# until a scroll no longer increases the page height.

# Folder that receives every image. It is created once, up front, so that both
# the data-URI branch and the HTTP branch can write into it.
output_dir = f'G:/Smokking_Project/{name}'
os.makedirs(output_dir, exist_ok=True)

# `src` values already saved. Each pass re-reads every thumbnail on the page,
# so without this set the same images would be written again after each scroll.
seen_urls = set()

try:
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Fixed pause after scrolling before the page is queried again.
        time.sleep(4)
        img_links = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//a[1]/div[1]/img')))
        time.sleep(1)
        for img in img_links:
            url = img.get_attribute('src')
            # Skip images with no `src` and anything saved on an earlier pass.
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)
            filename = f'{output_dir}/{x}.jpg'
            try:
                if url.startswith('data:image'):
                    # Inline image: "data:<mime>;base64,<payload>".
                    header, _, payload = url.partition(',')
                    if not header.endswith(';base64'):
                        # Not base64-encoded, so b64decode would produce junk.
                        continue
                    # Decode before opening, so a bad payload leaves no file.
                    img_data = base64.b64decode(payload)
                    with open(filename, 'wb') as f:
                        f.write(img_data)
                else:
                    # Regular http(s) address: fetch it over the network.
                    download_image(url, filename)
            except (OSError, ValueError) as exc:
                # One broken image (network error, bad URL, bad base64) must
                # not abort the whole run; report it and carry on.
                print(f'skipping image {x}: {exc}')
                continue
            # Advance only after a successful save so file names stay gap-free.
            x += 1
        new_height = driver.execute_script("return document.body.scrollHeight")
        print(new_height)
        # Unchanged height after a scroll means nothing new was loaded: stop.
        if new_height == last_height:
            break
        last_height = new_height
finally:
    # Always end the browser session, including when the wait times out or
    # another error escapes the loop.
    driver.quit()
此代码应该可以解决您提到的问题。它通过解码处理 Base64 图像并将其保存为图像文件。此外,它还通过发送 HTTP 请求并保存响应内容来从 URL 下载图像。