对于一个研究项目,我想从国际足联网站上抓取国际足球比赛的所有结果。我正在使用 R 来执行此操作。然而,包含比赛信息的表似乎是使用 javascript 生成的。这是我想抓取的网址:
http://www.fifa.com/live-scores/international-tournaments/fixtures-results/index.html#month5-2018
我尝试使用 phantomjs 渲染页面,以便在 javascript 生成表格之后再获取内容,但在生成的 html 中,仍然没有包含比赛结果的表。这是我的代码:
# Page to render. The URL must stay on a single line: a line break inside the
# literal becomes part of the URL and also breaks the single-quoted JavaScript
# string it is substituted into below.
url = "http://www.fifa.com/live-scores/international-tournaments/fixtures-results/index.html#month5-2018"

# Generate a PhantomJS script that opens the page, waits for the page's own
# JavaScript to build the results table, and then writes the rendered HTML to
# scrape.html. sprintf() substitutes the URL for the single %s placeholder.
writeLines(sprintf("
var page = require('webpage').create();
var fs = require('fs');
var path = 'scrape.html';
page.open('%s', function (status) {
  if (status !== 'success') {
    console.log('Failed to load page: ' + status);
    phantom.exit(1);
    return;
  }
  // The table is generated by JavaScript after the initial load, so give the
  // page's scripts time to run before capturing the DOM. The 5 second delay
  // is a heuristic; increase it if the table is still missing.
  window.setTimeout(function () {
    fs.write(path, page.content, 'w');
    phantom.exit();
  }, 5000);
});", url), con="scrape.js")

# Run the generated script; the rendered page ends up in scrape.html.
system("./phantomjs.exe scrape.js")
您无需等表构建完成后再去抓取它:该网站是通过调用如下端点来获取数据的。
http://data.fifa.com/livescores/en/internationaltournaments/matches/m/byyearandmonth/2018/5
http://data.fifa.com/livescores/live/matches
要找到它们,请使用浏览器上的网络检查器(按 F12)。更简单的方法是直接获取用于构建这些表的 json,而不是在表构建完成后再去抓取表。
编辑:构建表所需的全部数据都在这些 json 中。要获取数据,首先执行 GET 请求,下载这些链接返回的内容。检查返回内容时,您会看到它们是 json,但被包裹在一个函数调用之内,只需将这层函数调用删除即可。
例如,在第一个链接中,您可以删除
_matchesByYearAndMonthCallback(
和最后一个 )
,正是它们把 json 包裹在了函数调用之中。
删除后,您将获得一个有效的 json,您可以使用 jsonlite 或 rjson 包在 R 中解析它,请查阅相应的文档。使用这些软件包之一解析后,您应该会得到一个数据框,可以从中选取所需的信息。
以下是您将获得的 json 开头部分的示例:
{
"competitionslist": {
"0": {
"name": "Friendlies",
"idCup": 506,
"edition": 1872,
"idCupSeason": 2000010101,
"isFifaCompetition": true,
"countryCode": "",
"cupKindID": 105,
"competitionSeoName": "friendly-506",
"hasStanding": false,
"linkMatches": "",
"linkStanding": "",
"link": "",
"hasMatchLive": false,
"isActiveSeason": true,
"matchlist": [{
"idCup": 506,
"idCupSeason": 2000010101,
"edition": 1872,
"isLive": false,
"isActiveSeason": true,
"isFifaCompetition": true,
"isClubCompetition": false,
"competitionName": "Friendlies",
"providerCompetitionID": 0,
"providerEditionID": 0,
"idMatch": 300438343,
"internalMatchID": 0,
"idRound": 281863,
"idHomeTeam": 43818,
"homeCountryCode": "IRQ",
"homeTeamName": "Iraq",
"idAwayTeam": 43989,
"awayCountryCode": "PLE",
"awayTeamName": "Palestine",
"matchDate": "2018-05-08T16:00:00Z",
"matchDateUTC": "2018-05-08T16:00:00Z",
"kickOffTime": "16:00",
"minute": 0,
"status": 0,
"cupKindID": 105,
"cupKindName": "Friendly",
"hasLineup": false,
"scoreHome": 0,
"scoreAway": 0,
"venueName": "Basra ",
"competitionSeoName": "friendly-506",
"matchSeoName": "Iraq-Palestine-300438343",
"homeTeamSeoName": "iraq-43818",
"awayTeamSeoName": "palestine-43989",
"hasStanding": false,
"winTeamName": "",
"winTeamShortName": "",
"isStarted": true,
"isFinished": true,
"isAwarded": false,
"isPostponed": false,
"isSuspended": false,
"isAbandoned": false,
"link": "",
"isNextDay": false
}, {
"idCup": 506,
"idCupSeason": 2000010101,
"edition": 1872,
"isLive": false,
"isActiveSeason": true,
"isFifaCompetition": true,
"isClubCompetition": false,
"competitionName": "Friendlies",
"providerCompetitionID": 0,
"providerEditionID": 0,
"idMatch": 300439349,
"internalMatchID": 0,
"idRound": 281863,
"idHomeTeam": 43843,
"homeCountryCode": "ALG",
"homeTeamName": "Algeria",
"idAwayTeam": 43835,
"awayCountryCode": "KSA",
"awayTeamName": "Saudi Arabia",
"matchDate": "2018-05-09T19:30:00Z",
"minute": 0,
"status": 0,
"cupKindID": 105,
"cupKindName": "Friendly",
"hasLineup": false,
"scoreHome": 0,
"scoreAway": 2,
"venueName": "Cadiz ",
"idWinTeam": 43835,
"competitionSeoName": "friendly-506",
"matchSeoName": "Algeria-Saudi Arabia-300439349",
"homeTeamSeoName": "algeria-43843",
"awayTeamSeoName": "saudi-arabia-43835",
"hasStanding": false,
"winTeamName": "Saudi Arabia",
"winTeamShortName": "Saudi Arabia",
"isStarted": true,
"isFinished": true,
"isAwarded": false,
"isPostponed": false,
"isSuspended": false,
"isAbandoned": false,
"link": "",
"isNextDay": false
},