I want to extract data from multiple pages of the same URL into a single CSV file.
The way the script works is that you write the URLs into a url.txt file; the script reads them from that file and then saves the results to a CSV file.
I tried to work it out on my own, but I need help from people smarter than me, so please give me a hand.
As you can see in the code, I am trying to scrape data from kakaku.com (a Japanese site).
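To make the expected input concrete, url.txt is assumed to hold one listing-page URL per line. The paths below are placeholders, not real category pages:

'''
https://kakaku.com/<new-style-category-page>/
https://kakaku.com/<old-style-page>/itemlist.aspx
https://www.bestgate.net/<category-page>/
'''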
'''
import os
import sys
import csv
import codecs
import requests
from bs4 import BeautifulSoup
# scraping function for kakaku.com / old version (.aspx pages)
def kakaku_scraper_o(url):
    for u in url:
        headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        page = requests.get(u, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        titles_temp = soup.find_all(class_="ckitemLink")
        prices_temp = soup.find_all(class_="pryen")
        links_temp = soup.find_all(class_="ckitanker")
        titles = []
        prices = []
        links = []
        for i in range(len(titles_temp)):
            links.append(links_temp[i]['href'])
            titles.append(titles_temp[i].get_text())
            prices.append(prices_temp[i].get_text())
        # name the output file after the second-to-last URL path segment
        filename = u.split("/")
        filename = filename[-2] + "_kakaku.csv"
        with open(filename, 'w', encoding="utf-8", newline='') as csvFile:
            csvWriter = csv.writer(csvFile)
            csvWriter.writerow(["Link", "Title", "Price"])
            for i in range(len(titles)):
                # the file is already opened as UTF-8, so write the strings directly
                # (calling .encode("utf8") here would put b'...' byte literals into the CSV)
                csvWriter.writerow([links[i], titles[i], prices[i]])
# scraping function for kakaku.com / new version
def kakaku_scraper_n(url):
    for u in url:
        headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        page = requests.get(u, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        titles_temp = soup.find_all(class_="p-list_name")
        prices_temp = soup.find_all(class_="p-list_price_data_price_num-1 p-num")
        links_temp = soup.find_all(class_="p-list_name")
        titles = []
        prices = []
        links = []
        for i in range(len(titles_temp)):
            # the product link is the <a> tag inside the name element
            links_temp[i] = links_temp[i].find("a")
            links.append("https://kakaku.com" + str(links_temp[i]['href']))
            titles.append(titles_temp[i].get_text())
            prices.append(prices_temp[i].get_text())
        filename = u.split("/")
        filename = filename[-2] + "_kakaku.csv"
        with open(filename, 'w', encoding="utf-8", newline='') as csvFile:
            csvWriter = csv.writer(csvFile)
            csvWriter.writerow(["Link", "Title", "Price"])
            for i in range(len(titles)):
                csvWriter.writerow([links[i], titles[i], prices[i]])
# scraping function for bestgate.net
def bestgate_scraper(url):
    for u in url:
        headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        page = requests.get(u, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        titles_temp = soup.find_all(class_="name")
        prices_temp = soup.find_all(class_="price")
        links_temp = soup.find_all(class_="name")
        titles = []
        prices = []
        links = []
        for i in range(len(titles_temp)):
            links.append(links_temp[i]['href'])
            titles.append(titles_temp[i].get_text())
            prices.append(prices_temp[i].get_text())
        filename = u.split("/")
        filename = filename[-2] + "_bestgate.csv"
        with open(filename, 'w', encoding="utf-8", newline='') as csvFile:
            csvWriter = csv.writer(csvFile)
            csvWriter.writerow(["Link", "Title", "Price"])
            for i in range(len(titles)):
                csvWriter.writerow([links[i], titles[i], prices[i]])
# main function
if __name__ == '__main__':
    with open("url.txt", mode='r', newline='') as urlfile:
        url = urlfile.read().splitlines()
    print(url)
    # sort the links into one list per scraper
    kko = []
    kkn = []
    btg = []
    for u in url:
        if "kakaku" in u and "aspx" in u:
            # old-style kakaku.com pages (.aspx)
            kko.append(u)
        elif "kakaku" in u:
            # new-style kakaku.com pages
            kkn.append(u)
        else:
            btg.append(u)
    bestgate_scraper(btg)
    kakaku_scraper_o(kko)
    kakaku_scraper_n(kkn)
'''
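The functions above write one CSV per URL. Since the stated goal is a single CSV, one possible way to merge the results is to collect the (link, title, price) rows from every page into one list and write them once at the end. This is a minimal sketch, not part of the original script; the function name, row format, and output file name are assumptions:

'''
import csv

def write_combined(rows, filename="combined.csv"):
    # rows is assumed to be a list of (link, title, price) tuples
    # gathered from every scraped page
    with open(filename, 'w', encoding="utf-8", newline='') as csvFile:
        csvWriter = csv.writer(csvFile)
        csvWriter.writerow(["Link", "Title", "Price"])
        csvWriter.writerows(rows)
'''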
I do not fully understand your question, but I will offer the following observations.
If you update your question, I may be able to help you.