试图从网站上抓取数据但不断出错

问题描述 投票:0回答:1

好吧,基本上我要做的就是遍历过去6年中参加过NCAA男子篮球锦标赛的每一支球队,并从this website上抓取他们的球员名单。这是我的代码:

# For each row of team_performance: build a realgm.com stats URL from the
# Conference, Team, Number and Year columns, fetch it, try to parse its tables,
# append the result to mpg_data, and print a progress percentage.
#
# NOTE(review): base R's signature is gsub(pattern, replacement, x). The calls
# below pass the column value as `pattern`, " " as `replacement` and "-" as
# `x`, so the string being searched is the literal "-" rather than the
# conference/team name. The intended call was presumably gsub(" ", "-", value).
# NOTE(review): getURL is presumably RCurl::getURL, which returns the response
# body; if so, burner_mpg_link holds page content rather than a link -- confirm.
for (i in c(1:length(team_performance$Team))){
  burner_mpg_link <- getURL(paste("https://basketball.realgm.com/ncaa/conferences/",gsub(team_performance$Conference[i]," ","-"),
  "/16/",gsub(team_performance$Team[i]," ","-"),"/",team_performance$Number[i],"/stats/",team_performance$Year[i],
  "/Averages/All/All/Season/All/minutes/desc/1/",sep = ""))

  # `webpage` is assigned here but never used again in this loop.
  webpage <- read_html(burner_mpg_link)

  # NOTE(review): readHTMLTable is presumably from the XML package while
  # html_table is from rvest. tables[1] (single bracket) is a one-element list,
  # and this html_table() call is the line the question reports as failing.
  tables <- readHTMLTable(burner_mpg_link)
  table1 <- html_table(tables[1], fill = TRUE)
  # The result of data.frame(table1) is not assigned to anything.
  data.frame(table1)

  # NOTE(review): temp_data and x are not defined anywhere in this snippet.
  temp_data$NULL.Year <- NULL
  temp_data$NULL.Year <- x

  # mpg_data is not initialised in this snippet; it grows by one rbind per
  # iteration.
  mpg_data <- rbind(mpg_data,temp_data)

  # 384 is hard-coded; presumably the number of rows in team_performance --
  # TODO confirm.
  percent_complete <- (100/384)*i
  print(paste(round(percent_complete,digits = 2),"% complete",sep=""))
}

team_performance是一个数据框,在程序运行到这一步时,它包含球队名称、年份、锦标赛中的种子排名、晋级轮数、所属联盟(Conference)以及该球队在网站网址中对应的编号。我的问题是table1 <- html_table(tables[1], fill = TRUE)报错,提示html_table无法使用,因为tables[1]是一个列表。好的,这一点我明白。然后,当我尝试用unlist展开tables[1]时,它会给出Error in attributes(.Data) <- c(attributes(.Data), attrib) 'names' attribute [345] must be the same length as the vector [23]。知道我能做些什么来解决这个问题吗?

编辑:可复现的示例。

> head(team_performance)
                 Team Year Seed Rounds.Advanced              Conference Number
1               Akron 2013   12               1 Mid-American Conference    174
2             Alabama 2012    9               1 Southeastern Conference    253
3              Albany 2015   14               1 America East Conference      6
4              Albany 2014   16               1 America East Conference      6
5              Albany 2013   15               1 America East Conference      6
6 American University 2014   15               1         Patriot League     245

我希望抓取的是the given website上的球员统计表,并将其保存为数据框。

编辑2:

> dput(head(team_performance))
structure(list(Team = structure(c(1L, 2L, 3L, 3L, 3L, 4L), .Label = c("Akron", 
"Alabama", "Albany", "American University", "Arizona", "Arizona State", 
"Arkansas", "Austin Peay", "Baylor", "Belmont", "Brigham Young", 
"Bucknell", "Buffalo", "Butler", "Cal Poly", "Cal State Bakersfield", 
"California", "Chattanooga", "Cincinnati", "Coastal Carolina", 
"Colorado", "Colorado State", "Connecticut", "Creighton", "Davidson", 
"Dayton", "Delaware", "Detroit-Mercy", "Duke", "East Tennessee State", 
"Eastern Kentucky", "Eastern Washington", "Florida", "Florida Gulf Coast", 
"Florida State", "Fresno State", "George Washington", "Georgetown", 
"Georgia", "Georgia State", "Gonzaga", "Green Bay", "Hampton", 
"Harvard", "Hawaii", "Holy Cross", "Illinois", "Indiana", "Iona", 
"Iowa", "Iowa State", "Jacksonville State", "James Madison", 
"Kansas", "Kansas State", "Kent State", "Kentucky", "La Salle", 
"Lafayette", "Lehigh", "Little Rock", "Long Beach State", "Long Island", 
"Louisville", "Loyola (MD)", "LSU", "Manhattan", "Marquette", 
"Maryland", "Massachusetts", "Memphis", "Mercer", "Miami (FL)", 
"Michigan", "Michigan State", "Middle Tennessee State", "Milwaukee", 
"Minnesota", "Missouri", "Montana", "Mount St. Mary's", "Murray State", 
"NC State", "Nebraska", "Nevada", "New Mexico", "New Mexico State", 
"Norfolk State", "North Carolina", "North Carolina A&T", "North Carolina Central", 
"North Dakota", "North Dakota State", "Northeastern", "Northern Iowa", 
"Northern Kentucky", "Northwestern", "Northwestern State", "Notre Dame", 
"Ohio", "Ohio State", "Oklahoma", "Oklahoma State", "Ole Miss", 
"Oregon", "Oregon State", "Pacific", "Pittsburgh", "Princeton", 
"Providence", "Purdue", "Rhode Island", "Robert Morris", "Saint Joseph's", 
"Saint Louis", "Saint Mary's", "San Diego State", "Seton Hall", 
"South Carolina", "South Dakota State", "South Florida", "Southern", 
"Southern Methodist", "Southern Mississippi", "St. Bonaventure", 
"St. John's", "Stanford", "Stephen F. Austin", "Stony Brook", 
"Syracuse", "Temple", "Tennessee", "Texas", "Texas A&M", "Texas Southern", 
"Texas Tech", "Troy", "Tulsa", "UAB", "UC Davis", "UC Irvine", 
"UCLA", "UNC Asheville", "UNC Wilmington", "UNLV", "USC", "Utah", 
"Valparaiso", "Vanderbilt", "VCU", "Vermont", "Villanova", "Virginia", 
"Virginia Tech", "Weber State", "West Virginia", "Western Kentucky", 
"Western Michigan", "Wichita State", "Winthrop", "Wisconsin", 
"Wofford", "Wyoming", "Xavier", "Yale"), class = "factor"), Year = c(2013L, 
2012L, 2015L, 2014L, 2013L, 2014L), Seed = c(12L, 9L, 14L, 16L, 
15L, 15L), Rounds.Advanced = c(1L, 1L, 1L, 1L, 1L, 1L), Conference = structure(c(17L, 
25L, 1L, 1L, 1L, 24L), .Label = c("America East Conference", 
"American Athletic Conference", "Atlantic 10 Conference", "Atlantic Coast Conference", 
"Atlantic Sun Conference", "Big 12 Conference", "Big East Conference", 
"Big Sky Conference", "Big South Conference", "Big Ten Conference", 
"Big West Conference", "Colonial Athletic Association ", "ConferenceUSA ", 
"Horizon League ", "Ivy League ", "Metro Atlantic Athletic Conference", 
"Mid-American Conference", "Mid-Eastern Athletic Conference", 
"Missouri Valley Conference", "Mountain West Conference", "Northeast Conference", 
"Ohio Valley Conference", "Pacific-12 Conference", "Patriot League ", 
"Southeastern Conference", "Southern Conference", "Southland Conference", 
"Southwestern Athletic Conference", "Sun Belt Conference", "The Summit League ", 
"West Coast Conference", "Western Athletic Conference"), class = "factor"), 
Number = c(174L, 253L, 6L, 6L, 6L, 245L)), .Names = c("Team", 
"Year", "Seed", "Rounds.Advanced", "Conference", "Number"), row.names = c(NA, 
6L), class = "data.frame")
r dataframe web-scraping
1个回答
0
投票

假设dat = team_performance,您可以执行以下操作:

library(rvest)

# Turn a label into a URL path segment: coerce to character (the Team and
# Conference columns are factors), strip leading/trailing whitespace, then
# replace each run of whitespace with a single hyphen.
# Trimming matters because several Conference labels carry trailing
# whitespace (e.g. "Patriot League "), which would otherwise leave a stray "-"
# right before the next "/" in the URL.
to_slug <- function(x) {
  gsub("\\s+", "-", trimws(as.character(x)))
}

# One stats URL per row of `dat` (dat = team_performance). Columns are
# referenced by name instead of position so that a reordered data frame cannot
# silently produce wrong URLs.
s <- paste0(
  "https://basketball.realgm.com/ncaa/conferences/",
  to_slug(dat[["Conference"]]),
  "/16/", to_slug(dat[["Team"]]),
  "/", dat[["Number"]],
  "/stats/", dat[["Year"]],
  "/Averages/All/All/Season/All/minutes/desc/1/"
)

# Download every page and parse all of its <table> nodes into data frames.
# `A` is a list with one element per URL in `s`; each element is the list of
# tables found on that page. A page that fails to download or parse yields
# NULL plus a warning, so one bad URL does not abort the whole run.
A <- lapply(s, function(x) {
  tryCatch(
    read_html(x) %>%
      html_nodes("table") %>%
      html_table(),
    error = function(e) {
      warning("Failed to scrape ", x, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
})
© www.soinside.com 2019 - 2024. All rights reserved.