我正在尝试抓取 Instagram,例如 Nike 的 Instagram 主页。不过,我只想获取图片的描述,也就是图片标签 alt 属性里的描述文字。我尝试把它提取并输出,但没有成功。
这是我的代码:
import scrapy


class Nike(scrapy.Spider):
    """Spider that collects image descriptions from Nike's Instagram profile.

    It requests the profile page listed in ``start_urls`` and, for every
    post container matched in the response, yields the ``alt`` text of the
    images found inside it.
    """

    name = 'nike'
    urls = 'http://www.instagram.com/nike/'
    start_urls = [urls]
    # allowed_domains expects bare domain names, not URLs: an entry with a
    # scheme such as 'http://instagram.com' is rejected by Scrapy's offsite
    # filtering. 'instagram.com' also covers the 'www.' subdomain.
    allowed_domains = ['instagram.com']

    def parse(self, response):
        """Yield one item per post container with its image descriptions.

        :param response: the downloaded profile page.
        :returns: a generator of dicts of the form ``{'name': [alt, ...]}``,
            where the list holds every ``alt`` value found in the container.
        """
        # NOTE(review): this only matches if the div._jjzlb nodes are present
        # in the HTML as served; if the page builds them with JavaScript the
        # loop yields nothing -- confirm against the raw response body.
        for post in response.css('div._jjzlb'):
            yield {
                # `alt` is an attribute of <img>, so it is read with
                # ::attr(alt); 'alt::text' would look for an <alt> element.
                # Presumably the <img> is nested under the div -- verify.
                'name': post.css('img::attr(alt)').extract(),
            }
按照OP在评论中的要求,这里是一个使用js2xml直接从HTML源解析JavaScript数据的示例(免责声明:我是js2xml的作者):
$ scrapy version -v
Scrapy : 1.4.0
lxml : 3.7.3.0
libxml2 : 2.9.3
cssselect : 1.0.1
parsel : 1.2.0
w3lib : 1.17.0
Twisted : 17.1.0
Python : 2.7.12+ (default, Sep 17 2016, 12:08:02) - [GCC 6.2.0 20160914]
pyOpenSSL : 17.0.0 (OpenSSL 1.0.2g 1 Mar 2016)
Platform : Linux-4.8.0-53-generic-x86_64-with-Ubuntu-16.10-yakkety
$ scrapy shell http://www.instagram.com/nike/
2017-05-30 10:06:39 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
(...)
2017-05-30 10:06:40 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.instagram.com/nike/> from <GET http://www.instagram.com/nike/>
2017-05-30 10:06:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.instagram.com/nike/> (referer: None)
>>> response.xpath('//script[contains(., "sharedData")]')
[<Selector xpath='//script[contains(., "sharedData")]' data=u'<script type="text/javascript">window._s'>]
>>> response.xpath('string(//script[contains(., "sharedData")])').get()
u'window._sharedData = {"activity_counts": null, "config": (...redacted...) "show_app_install": true};'
>>> import js2xml
>>> # get the JavaScript statements
>>> js = response.xpath('string(//script[contains(., "sharedData")])').get()
>>> # parse them with js2xml
>>> jstree = js2xml.parse(js)
>>> # locate the object we're interested in:
>>> # here it's the right-hand side of the following assignment:
>>> #                      |--- this bit
>>> # window._sharedData = {"activity_counts":...
>>>
>>> obj = jstree.xpath('//assign[left//identifier[@name="_sharedData"]]/right/*')[0]
>>> # convert this object to a Python dict
>>> sharedData = js2xml.jsonlike.make_dict(obj)
>>>
>>> from pprint import pprint
>>> # quite a lot of data...
>>> pprint(sharedData)
{'activity_counts': None,
'config': {'csrf_token': 'JzQ8ja9YlG1cKqwAbvwacTKhOc4FuBDT', 'viewer': None},
'country_code': 'FR',
'display_properties_server_guess': {'pixel_ratio': 1.5,
'viewport_width': 360},
'entry_data': {'ProfilePage': [{'logging_page_id': 'profilePage_13460080',
'user': {'biography': 'Just Do It.',
'blocked_by_viewer': False,
'connected_fb_page': None,
'country_block': False,
'external_url': 'http://nike.com/justdoit',
'external_url_linkshimmed': 'http://l.instagram.com/?u=http%3A%2F%2Fnike.com%2Fjustdoit&e=ATOPqaPHEBPK4ylZ6YZRynSTQdp28oGedfFWw3gLtqoII1ZhtlDhLI0vu9Udbjg',
'followed_by': {'count': 72447697},
'followed_by_viewer': False,
'follows': {'count': 136},
'follows_viewer': False,
'full_name': 'nike',
'has_blocked_viewer': False,
'has_requested_viewer': False,
'id': '13460080',
'is_private': False,
'is_verified': True,
'media': {'count': 897,
'nodes': [{'__typename': 'GraphVideo',
'caption': 'Eliud Kipchoge - 2:00:25\nThe barrier just got that much closer.\n#Breaking2 #JustDoIt',
'code': 'BTvxgGEAn-p',
'comments': {'count': 2274},
'comments_disabled': False,
'date': 1494064175,
'dimensions': {'height': 640,
'width': 640},
'display_src': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-15/s640x640/e15/18380674_1840678496198968_334757937257906176_n.jpg',
'id': '1508642110004428713',
'is_video': True,
'likes': {'count': 296763},
'owner': {'id': '13460080'},
'thumbnail_src': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-15/s640x640/e15/18380674_1840678496198968_334757937257906176_n.jpg',
'video_views': 1193309},
(...redacted...)
{'__typename': 'GraphVideo',
'caption': u'We step onto the court as equals. Judged only by our performance. New Orleans sets the stage for All Star Weekend @nikebasketball.\xa0#EQUALITY #nike',
'code': 'BQlZFdugTIG',
'comments': {'count': 1172},
'comments_disabled': False,
'date': 1487273379,
'dimensions': {'height': 360,
'width': 640},
'display_src': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-15/s640x640/e15/16789854_771546756335957_8222167172487577600_n.jpg',
'id': '1451676781575746054',
'is_video': True,
'likes': {'count': 184595},
'owner': {'id': '13460080'},
'thumbnail_src': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-15/e15/c157.0.406.406/16789854_771546756335957_8222167172487577600_n.jpg',
'video_views': 627647}],
'page_info': {'end_cursor': 'AQCseCeUI_vJ_XzpTTIf7AhmwqbkBSf959347Wl03MM_ErpUQqP6RGjuJRdEcH1qJBCWvPGxUhpm6rPv5esAW5GvXHGodwmYA0eT51wnk8_-KA',
'has_next_page': True}},
'profile_pic_url': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-19/s150x150/17126848_1779368432381854_3589478532054515712_a.jpg',
'profile_pic_url_hd': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-19/s320x320/17126848_1779368432381854_3589478532054515712_a.jpg',
'requested_by_viewer': False,
'username': 'nike'}}]},
'environment_switcher_visible_server_guess': True,
'gatekeepers': {'bn': True, 'ld': True, 'pl': True},
'hostname': 'www.instagram.com',
'language_code': 'en',
'platform': 'web',
'probably_has_app': False,
'qe': {'bc3l': {'g': '', 'p': {}},
'create': {'g': '', 'p': {}},
'deact': {'g': '', 'p': {}},
'disc': {'g': '', 'p': {}},
'ebd': {'g': 'holdout', 'p': {'use_new_styles': 'false'}},
'feed': {'g': '', 'p': {}},
'gql': {'g': '', 'p': {}},
'nav': {'g': '', 'p': {}},
'nav_lo': {'g': '', 'p': {}},
'pm': {'g': '', 'p': {}},
'poe': {'g': '', 'p': {}},
'profile': {'g': '', 'p': {}},
'sidecar': {'g': '', 'p': {}},
'su_universe': {'g': '', 'p': {}},
'ufi': {'g': '', 'p': {}},
'ufi_loggedout': {'g': '', 'p': {}},
'us': {'g': '', 'p': {}},
'us_li': {'g': '', 'p': {}},
'video': {'g': '', 'p': {}}},
'show_app_install': True}
>>> # nodes have a 'caption' key that looks like the "alt" attributes
>>> pprint(sharedData['entry_data']['ProfilePage'][0]['user']['media']['nodes'])
[{'__typename': 'GraphVideo',
'caption': 'Eliud Kipchoge - 2:00:25\nThe barrier just got that much closer.\n#Breaking2 #JustDoIt',
'code': 'BTvxgGEAn-p',
'comments': {'count': 2274},
'comments_disabled': False,
'date': 1494064175,
'dimensions': {'height': 640, 'width': 640},
'display_src': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-15/s640x640/e15/18380674_1840678496198968_334757937257906176_n.jpg',
'id': '1508642110004428713',
'is_video': True,
'likes': {'count': 296763},
'owner': {'id': '13460080'},
'thumbnail_src': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-15/s640x640/e15/18380674_1840678496198968_334757937257906176_n.jpg',
'video_views': 1193309},
{'__typename': 'GraphImage',
'caption': 'Eliud Kipchoge - 2:00:25\nThe barrier just got that much closer.\n#Breaking2 #JustDoIt',
'code': 'BTvj_-gg6B6',
'comments': {'count': 2269},
'comments_disabled': False,
'date': 1494057096,
'dimensions': {'height': 1080, 'width': 1080},
'display_src': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-15/e35/18299402_128309274385168_1749644873230712832_n.jpg',
'id': '1508582728264818810',
'is_video': False,
'likes': {'count': 484745},
'owner': {'id': '13460080'},
'thumbnail_src': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-15/s640x640/sh0.08/e35/18299402_128309274385168_1749644873230712832_n.jpg'},
(...redacted...)
{'__typename': 'GraphVideo',
'caption': u'We step onto the court as equals. Judged only by our performance. New Orleans sets the stage for All Star Weekend @nikebasketball.\xa0#EQUALITY #nike',
'code': 'BQlZFdugTIG',
'comments': {'count': 1172},
'comments_disabled': False,
'date': 1487273379,
'dimensions': {'height': 360, 'width': 640},
'display_src': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-15/s640x640/e15/16789854_771546756335957_8222167172487577600_n.jpg',
'id': '1451676781575746054',
'is_video': True,
'likes': {'count': 184595},
'owner': {'id': '13460080'},
'thumbnail_src': 'https://scontent-cdg2-1.cdninstagram.com/t51.2885-15/e15/c157.0.406.406/16789854_771546756335957_8222167172487577600_n.jpg',
'video_views': 627647}]
>>>