所以当我从一个我正在创建的类中运行以下代码来抓取Craigslist.org时,我一直得到一个socket.error 61。我尝试过各种版本的Chromedriver和PhantomJS,但似乎无法让它消失。起初我以为是我的IP被标记,所以我通过代理轮换,但这没有帮助。我确信它很简单,但我似乎无法弄清楚它是什么。任何帮助将非常感激!
def __init__(self):
self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
self.options = webdriver.ChromeOptions()
self.options.add_argument('headless')
self.options.add_argument('--proxy-server=http://12.221.240.25:8080')
self.options.add_argument('user-agent={self.user_agent}')
self.current_region = ''
self.driver = webdriver.Chrome()
self.driver.get('https://craigslist.org')
self.proxy_list = ['208.95.62.81:3128', '208.95.62.80:3128', '159.203.181.50:3128', '35.196.26.166:3128']
def scrape_test(self):
self.scraper_wait(self.driver, '//*[@id="rightbar"]')
rightbar = self.driver.find_element_by_xpath('//*[@id="rightbar"]')
nearby_cl = rightbar.find_element_by_xpath('//*[@id="rightbar"]/ul/li[1]')
while True:
child_items = nearby_cl.find_elements_by_class_name('s')
random = randint(1, len(child_items))
try:
time.sleep(10)
print("Clicking {}".format(child_items[random].text))
child_items[random].click()
housing = self.driver.find_element_by_xpath('//*[@id="hhh"]/h4/a')
housing.click()
self.driver.back()
time.sleep(5)
except WebDriverException:
continue
except Exception as e:
print(e.message)
return
finally:
self.driver.quit()
堆栈跟踪也如下:
File "scraper.py", line 131, in <module>
cl.scrape_test()
File "scraper.py", line 81, in scrape_test
child_items = nearby_cl.find_elements_by_class_name('s')
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/selenium/webdriver/remote/webelement.py", line 299, in find_elements_by_class_name
return self.find_elements(by=By.CLASS_NAME, value=name)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/selenium/webdriver/remote/webelement.py", line 527, in find_elements
{"using": by, "value": value})['value']
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/selenium/webdriver/remote/webelement.py", line 493, in _execute
return self._parent.execute(command, params)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 250, in execute
response = self.command_executor.execute(driver_command, params)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 464, in execute
return self._request(command_info[0], url, body=data)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 487, in _request
self._conn.request(method, parsed_url.path, body, headers)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1057, in request
self._send_request(method, url, body, headers)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1097, in _send_request
self.endheaders(body)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1053, in endheaders
self._send_output(message_body)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 897, in _send_output
self.send(msg)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 859, in send
self.connect()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 836, in connect
self.timeout, self.source_address)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 575, in create_connection
raise err
socket.error: [Errno 61] Connection refused
在你完成使用之前,你将在第一次通过while
循环结束时拆除驱动程序。
相反,将调用driver.quit()
移动到您确定使用驱动程序完成的某个地方,例如:
def scrape_test(self):
try:
# ...
while True:
# ...
finally:
self.driver.quit()