如何将网页抓取为与 Pandas DataFrame 兼容的字典?

问题描述 投票:1回答:2
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

# Download the search-results page and parse the HTML.
r = requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
r.content
soup = BeautifulSoup(r.content)

# Three independent result sets: each find_all() is run against the whole
# page, so the lists are not guaranteed to have the same number of elements.
g_data = soup.find_all("div", {"class": "tile-content"})
g_price = soup.find_all("div", {"class": "item-price-container"})
g_star = soup.find_all("div", {"class": "stars stars-small tile-row"})

# Column name -> list of scraped values.
data = defaultdict(list)
for product_title in g_data:
    a_product_title = product_title.find_all("a", "js-product-title")
    for text_product_title in a_product_title:
        data['Product Title'].append(text_product_title.text)

for row in g_price:
    price = row.find('span', 'price price-display').text.strip()
    data['Price'].append(price)

for allstar in g_star:
    star = allstar.find('span', 'visuallyhidden').text.strip()
    data['Stars'].append(star)

dd_starring = soup.find_all('dd', {"class": "media-details-artist-dd module"})
for dd in dd_starring:
    actors = dd.text
# data['Actors'].append(actors)

# pandas raises ValueError here when the columns collected above differ in
# length, which is the problem this question asks about.
df = pd.DataFrame(data)
df


如果我尝试用 data['Stars'].append(star) 这一行来追加星级数据,就会得到以下错误:

ValueError: arrays must all be same length(数组必须都是相同的长度)

应该怎样追加才对?没有星级的行应该显示为 NA。

有什么建议?请帮忙

html pandas dataframe
2个回答
2
投票

您不需要构建单独的内容列表来循环。您可以遍历g_data,这意味着您将不会有不同长度的结果集。

from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

# Download the search-results page and parse it. The parser is named
# explicitly so the result does not depend on which parsers are installed.
r = requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
soup = BeautifulSoup(r.content, "html.parser")

# Every field is read from the same "tile-content" div, so each iteration
# appends exactly one value to every column and the columns stay aligned.
g_data = soup.find_all("div", {"class": "tile-content"})

# Column name -> list of scraped values.
data = defaultdict(list)
for content in g_data:
    title = content.find("a", "js-product-title")
    data['Product Title'].append(title.text)

    try:
        stars = content.find("div", {"class": "stars stars-small tile-row"}).find('span', 'visuallyhidden').text.strip()
    except AttributeError:
        # find() returned None somewhere in the chain: no rating in this tile.
        stars = None
    data['Stars'].append(stars)

    price = content.find('span', 'price price-display').text.strip()
    data['Price'].append(price)

df = pd.DataFrame(data)
df

据我所知,内部循环也没有必要,因为每个项目只有一个名称,价格和评级。


0
投票

您最初的问题是因为各个独立循环得到的元素数量不一致(例如 15 个星级、20 个价格)。避免此类问题的最佳方法是只使用一个循环,然后对每个要抓取的字段分别应用 try/except。这样,即使某个字段缺失或出错,您仍然可以收集到存在的内容。

from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

# Download the search-results page and parse it. The parser is named
# explicitly so the result does not depend on which parsers are installed.
r = requests.get("http://www.walmart.com/search/?query=marvel&cat_id=4096_530598")
soup = BeautifulSoup(r.content, "html.parser")

g_data = soup.find_all("div", {"class": "tile-content"})

# Column name -> list of scraped values.
data = defaultdict(list)

# One loop to rule them all: every column gets exactly one value per tile,
# so all columns end up with the same length.
for tile in g_data:
    # Product title ("" when the element is missing).
    try:
        title = tile.find("a", "js-product-title").text
    except AttributeError:
        # find() returned None, so there is no .text to read.
        title = ""
    data['Product Title'].append(title)

    # Price ("" when the element is missing).
    try:
        price = tile.find('span', 'price price-display').text.strip()
    except AttributeError:
        price = ""
    data['Price'].append(price)

    # Star rating ("" when the element is missing).
    try:
        g_star = tile.find("div", {"class": "stars stars-small tile-row"}).find('span', 'visuallyhidden').text.strip()
    except AttributeError:
        g_star = ""
    data['Stars'].append(g_star)

df = pd.DataFrame(data)
© www.soinside.com 2019 - 2024. All rights reserved.