I am retrieving data from an Azure Table through the REST API with the following code:
library(httr)
library(RCurl)
library(bitops)
library(xml2)
library(jsonlite) # fromJSON() below comes from jsonlite
# Stores credentials in variable
Account <- "storageaccount"
Container <- "Usage"
Key <- "key"
# Composes URL
URL <- paste0(
"https://",
Account,
".table.core.windows.net",
"/",
Container
)
# Requests time stamp
requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")
# As per Microsoft's specs, an empty line is needed for content-length
content_length <- 0
# Composes signature string
signature_string <- paste0(
"GET", "\n", # HTTP Verb
"\n", # Content-MD-5
"text/xml", "\n", # Content-Type
requestdate, "\n", # Date
"/", Account, "/", Container # Canonicalized resource
)
# Composes header string
header_string <- add_headers(
Authorization=paste0(
"SharedKey ",
Account,
":",
RCurl::base64(
digest::hmac(
key = RCurl::base64Decode(
Key, mode = "raw"
),
object = enc2utf8(signature_string),
algo = "sha256",
raw = TRUE
)
)
),
'x-ms-date' = requestdate,
'x-ms-version' = "2020-12-06",
'Content-type' = "text/xml"
)
# Creates request
xml_body = content(
GET(
URL,
config = header_string,
verbose()
),
"text"
)
Get_data <- xml_body # Gets data as text from API
From_JSON <- fromJSON(Get_data, flatten = TRUE) # Parses text from JSON
Table_name <- as.data.frame(From_JSON) # Saves data to a table
I can now view the table, but I noticed that I only get the first 1,000 rows. What is the most efficient way to implement a loop that retrieves all the remaining rows and updates the table?
I need to be able to work with the entire dataset.
Also keep in mind that the table will grow by roughly 40,000 rows per day, so keeping the visuals in sync with the data is a concern.
Not sure how to do this in R specifically, but here's the general approach:
When you list the entities in a table, a single request returns a maximum of 1,000 entities. If the table contains more than 1,000 entities, the Table service returns two additional headers: x-ms-continuation-NextPartitionKey and x-ms-continuation-NextRowKey. The presence of these two headers tells you that there is more data available to fetch.
What you need to do is take these headers and pass their values as two query parameters, NextPartitionKey and NextRowKey, in the URL of your next request. So the request would look something like this:
https://account.table.core.windows.net/Table?NextPartitionKey=<x-ms-continuation-NextPartitionKey header value>&NextRowKey=<x-ms-continuation-NextRowKey header value>
You repeat this process until the response no longer includes these headers.
You can read more about it here: https://learn.microsoft.com/en-us/rest/api/storageservices/query-timeout-and-pagination.
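In rough, untested R, the loop would look something like the sketch below. Treat it as an illustration of the flow rather than working code: it assumes httr/jsonlite/dplyr, reuses the Account, Container and header_string objects from your question, and assumes the entities sit under $value in the JSON payload.
# Untested sketch: page through the table using the continuation headers.
# Assumes Account, Container and header_string (the signed Authorization header)
# from the question are already defined and still valid.
library(httr)
library(jsonlite)
library(dplyr)
base_URL <- paste0("https://", Account, ".table.core.windows.net/", Container)
next_URL <- base_URL
pages    <- list()
repeat {
  resp <- GET(next_URL, config = header_string)
  stop_for_status(resp)
  parsed <- fromJSON(content(resp, "text"), flatten = TRUE)
  pages[[length(pages) + 1]] <- as.data.frame(parsed$value) # entities are usually under $value
  # httr exposes response headers case-insensitively; missing headers come back NULL
  next_pk <- headers(resp)[["x-ms-continuation-NextPartitionKey"]]
  next_rk <- headers(resp)[["x-ms-continuation-NextRowKey"]]
  if (is.null(next_pk) || is.null(next_rk)) break # no continuation headers -> last page
  next_URL <- paste0(base_URL,
                     "?NextPartitionKey=", URLencode(next_pk, reserved = TRUE),
                     "&NextRowKey=",       URLencode(next_rk, reserved = TRUE))
  # Note: the Authorization header signs the request date, so for very long runs
  # the signature/date may need to be recomputed.
}
all_rows <- bind_rows(pages)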
Thanks for the tip! I've put together some code... Unfortunately, it doesn't get past the first loop iteration (retrieving page 3), and I don't quite understand why.
I wrote it by making a few assumptions, for example that the metadata is always returned with the same structure.
Here is the code:
library(httr)
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)
# Retrieves metadata
Get_headers <- capture.output(
content(
GET(
URL,
config = header_string,
verbose()
)
),
type = "message")
Server_response <- Get_headers[11] %>%
trimws( whitespace = "\r") %>%
trimws( whitespace = "<- ") %>%
grepl("HTTP/1.1 200 OK")
# Initializes variables
Pages <- 0
Next_headers_count <- 0
# Fetches data only if authentication was successful
if (Server_response == TRUE) {
Get_data <- xml_body # Gets data as text from API
From_JSON <- fromJSON(Get_data, flatten = TRUE) # Parses text from JSON
Table_name <- as.data.frame(From_JSON) # Saves data to a table
Pages <- Pages + 1 # One page of data has been retrieved
# Checks if there are more than 1000 rows to be fetched
x_ms_continuation_NextPartitionKey <- Get_headers[19] %>%
trimws( whitespace = "<- ") %>%
gsub("\\.*", "x-ms-continuation-NextPartitionKey") %>%
grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)
x_ms_continuation_NextRowKey <- Get_headers[20] %>%
trimws( whitespace = "<- ") %>%
gsub("\\.*", "x-ms-continuation-NextRowKey") %>%
grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
# Starts loop to retrieve additional data
while (x_ms_continuation_NextPartitionKey == TRUE &
x_ms_continuation_NextRowKey == TRUE) {
Pages <- Pages + 1 # Counts the number of pages retrieved, including the initial page
Next_headers_count <- Next_headers_count +1 # Counts the number of Next headers passed by the metadata
Next_Partition_Key <- Get_headers[19] %>% # Extracts the value of the Next Partition Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
Next_Row_key <- Get_headers[20] %>% # Extracts the value of the Next Row Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
Next_URL <- paste0( # Creates the URL for the Next Authentication token
"https://",
Account,
".table.core.windows.net",
"/",
Container,
"?",
"NextPartitionKey=",
Next_Partition_Key,
"&NextRowKey=",
Next_Row_key
)
next_xml_body = content( # Retrieves next 1000 rows of content from table
GET(
Next_URL,
config = header_string,
verbose()
),
"text"
)
Get_new_data <- next_xml_body # Gets data as text from API
From_JSON <- fromJSON(Get_new_data, flatten = TRUE) # Parses text from JSON
Temp_table_name <- as.data.frame(From_JSON) # Saves data to a table
Table_name <- bind_rows(Temp_table_name, Table_name) # Appends new data to the initial data
Get_new_headers <- capture.output( # Retrieves new next headers
content(
GET(
Next_URL,
config = header_string,
verbose()
)
),
type = "message")
New_server_response <- Get_new_headers[11] %>%
trimws( whitespace = "\r") %>%
trimws( whitespace = "<- ") %>%
grepl("HTTP/1.1 200 OK")
# Checks if there are more than 1000 rows to be fetched
New_x_ms_continuation_NextPartitionKey <- Get_new_headers[19] %>%
trimws( whitespace = "<- ") %>%
gsub("\\.*", "x-ms-continuation-NextPartitionKey") %>%
grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)
New_x_ms_continuation_NextRowKey <- Get_new_headers[20] %>%
trimws( whitespace = "<- ") %>%
gsub("\\.*", "x-ms-continuation-NextRowKey") %>%
grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
x_ms_continuation_NextPartitionKey <- New_x_ms_continuation_NextPartitionKey
x_ms_continuation_NextRowKey <- New_x_ms_continuation_NextRowKey
Next_Partition_Key <- Get_new_headers[19] %>% # Extracts the value of the Next Partition Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
Next_Row_key <- Get_new_headers[20] %>% # Extracts the value of the Next Row Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
}
} else {print("authentication failed")}
# Previews table
Pages
Next_headers_count
View(Table_name)
With this I can only retrieve 2,000 entries. When the next cycle starts, it fails. It seems to fail here:
Get_new_headers <- capture.output( # Retrieves new next headers
content(
GET(
Next_URL,
config = header_string,
verbose()
)
),
type = "message")
Any help would be greatly appreciated!
I figured it out... I refined the script and now it works. :-)
The code could be polished further, but here is a working script that iteratively retrieves all the data from the table.
# Connects to an Azure Table based on the specifications for Shared Key authorization:
# https://learn.microsoft.com/en-us/rest/api/storageservices/authorize-with-shared-key
library(httr)
library(RCurl)
library(bitops)
library(xml2)
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)
# Stores credentials in variable
Account <- "storage"
Container <- "Usage"
Key <- "key"
# Composes URL
URL <- paste0(
"https://",
Account,
".table.core.windows.net",
"/",
Container
)
# Requests time stamp
requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")
# As per Microsoft's specs, an empty line is needed for content-length
content_length <- 0
# Composes signature string
signature_string <- paste0(
"GET", "\n", # HTTP Verb
"\n", # Content-MD-5
"text/xml", "\n", # Content-Type
requestdate, "\n", # Date
"/", Account, "/", Container # Canonicalized resource
)
# Composes header string
header_string <- add_headers(
Authorization=paste0(
"SharedKey ",
Account,
":",
RCurl::base64(
digest::hmac(
key = RCurl::base64Decode(
Key, mode = "raw"
),
object = enc2utf8(signature_string),
algo = "sha256",
raw = TRUE
)
)
),
'x-ms-date' = requestdate,
'x-ms-version' = "2020-12-06",
'Content-type' = "text/xml"
)
# Calls
Get_headers <- capture.output( # Retrieves metadata
content(
GET(
URL,
config = header_string,
verbose()
)
),
type = "message"
)
Server_response <- Get_headers[11] %>% # Retrieves server response
trimws( whitespace = "\r") %>%
trimws( whitespace = "<- ") %>%
grepl("HTTP/1.1 200 OK")
Get_headers
Server_response
# Initializes counters and an empty table to accumulate results
Pages <- 0
Next_headers_count <- 0
Table_name <- data.frame() # bind_rows() inside the loop appends each page to this
while(isTRUE(Server_response)) {
Pages <- Pages + 1
xml_body <- content( # Retrieves up to 1000 rows from the table
GET(
URL,
config = header_string,
verbose()
),
"text"
)
Get_data <- xml_body # Gets data as text from API
From_JSON <- fromJSON(Get_data, flatten = TRUE) # Parses text from JSON
Temp_table_name <- as.data.frame(From_JSON) # Saves current rows to temp table
Table_name <- bind_rows(Temp_table_name, Table_name) # Appends new data to the initial data
# Checks if there are more than 1000 rows to be fetched
x_ms_continuation_NextPartitionKey <- Get_headers[19] %>%
trimws( whitespace = "<- ") %>%
gsub("\\.*", "x-ms-continuation-NextPartitionKey") %>%
grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)
x_ms_continuation_NextRowKey <- Get_headers[20] %>%
trimws( whitespace = "<- ") %>%
gsub("\\.*", "x-ms-continuation-NextRowKey") %>%
grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
x_ms_continuation_NextPartitionKey
x_ms_continuation_NextRowKey
if (isTRUE(x_ms_continuation_NextPartitionKey) &
isTRUE(x_ms_continuation_NextRowKey)) {
Next_headers_count <- Next_headers_count + 1
Next_Partition_Key <- Get_headers[19] %>% # Extracts the value of the Next Partition Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
Next_Row_key <- Get_headers[20] %>% # Extracts the value of the Next Row Key
str_remove(".+(?= )") %>%
trimws( whitespace =" ") %>%
trimws( whitespace = "\r")
URL <- paste0( # Creates the URL for the Next Authentication token
"https://",
Account,
".table.core.windows.net",
"/",
Container,
"?",
"NextPartitionKey=",
Next_Partition_Key,
"&NextRowKey=",
Next_Row_key
)
Get_headers <- capture.output( # Retrieves new metadata
content(
GET(
URL,
config = header_string,
verbose()
)
),
type = "message"
)
Server_response <- Get_headers[11] %>% # Retrieves new server response
trimws( whitespace = "\r") %>%
trimws( whitespace = "<- ") %>%
grepl("HTTP/1.1 200 OK")
}
}
Pages
Next_headers_count
View(Table_name)
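A couple of caveats on the approach: the script relies on the verbose output always having the status line at Get_headers[11] and the continuation headers at positions 19 and 20, and it issues two GET requests per page (one for the body, one captured for the headers). If that ever turns out to be brittle, the continuation tokens can presumably be read off a single response with httr's headers() and URL-encoded before being appended to the next URL, along the lines of the sketch in the answer above.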