我正在尝试读取和解析 XML 文件并将其转换为数据框对象,但每次尝试都在列表级别返回“character[0]”,而列表级别预计返回字符串或数字。此外,当尝试调试 read_xml 函数时,控制台不断打印(并且在停止调试模式时不会停止)以下错误:
INTEGER() can only be applied to a 'integer', not a 'unknown type #29'
问题可能是什么原因造成的?我可以传递给 read_xml 函数的其他参数是否会影响它读取值和字符串的方式?
这是我正在运行的完整脚本:
library(dplyr)
library (xml2)
test <- read_xml("path/XMLbulletinsample.xml", encoding = "UTF-8")
ns <- xml_ns(test)
reports <- xml_find_all(test, ".//d3:PricRpt", ns = ns)
# Initializing an empty list to store data
data_list <- list()
# Looping through each PricRpt node and extract information
for (report in reports) {
# Extract trade date, security ID, and various price attributes
trad_dt <- xml_text(xml_find_first(report, ".//d3:TradDt/Dt"))
ticker <- xml_text(xml_find_first(report, ".//d3:SctyId/TckrSymb"))
open_interest <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/OpnIntrst"))
first_price <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/FrstPric"))
min_price <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/MinPric"))
max_price <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/MaxPric"))
avg_price <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/TradAvrgPric"))
last_price <- xml_text(xml_find_first(report, ".//d3:FinInstrmAttrbts/LastPric"))
# Add extracted data to the list
data_list[[length(data_list) + 1]] <- data.frame(
trad_dt = trad_dt,
ticker = ticker,
open_interest = as.numeric(open_interest),
first_price = as.numeric(first_price),
min_price = as.numeric(min_price),
max_price = as.numeric(max_price),
avg_price = as.numeric(avg_price),
last_price = as.numeric(last_price),
stringsAsFactors = FALSE
)
}
# Combine the list into a single dataframe
df <- bind_rows(data_list)
这是我从完整 XML 文件中获取的示例,以便在创建解析脚本时进行处理:
<?xml version="1.0" encoding="utf-8"?>
<Document xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:bvmf.052.01.xsd bvmf.052.01.xsd" xmlns="urn:bvmf.052.01.xsd">
<BizFileHdr>
<Xchg>
<BizGrpDesc>
<Fr>
<OrgId>
<Id>
<OrgId>
<Othr>
<Id>BVMF</Id>
<Issr>40</Issr>
<SchmeNm>
<Prtry>39</Prtry>
</SchmeNm>
</Othr>
</OrgId>
</Id>
</OrgId>
</Fr>
<To>
<OrgId>
<Id>
<OrgId>
<Othr>
<Id>PUBLIC</Id>
<Issr>40</Issr>
<SchmeNm>
<Prtry>39</Prtry>
</SchmeNm>
</Othr>
</OrgId>
</Id>
</OrgId>
</To>
<BizGrpDtls>
<BizGrpIdr>BV000471202408300001000071943367860</BizGrpIdr>
<TtlNbOfMsg>2264</TtlNbOfMsg>
<BizGrpTp>BVBG.187.01</BizGrpTp>
<CreDtAndTm>2024-08-30T19:43:36</CreDtAndTm>
</BizGrpDtls>
<MsgTpDef>
<MsgDefIdr>BVMF.217.01</MsgDefIdr>
<NbOfMsg>2264</NbOfMsg>
</MsgTpDef>
</BizGrpDesc>
<BizGrp>
<AppHdr xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:iso:std:iso:20022:tech:xsd:head.001.001.01">
<BizMsgIdr>BV000471202408300001000071956346330</BizMsgIdr>
<MsgDefIdr>BVMF.217.01</MsgDefIdr>
<CreDt>2024-08-30T22:56:34Z</CreDt>
<Fr>
<OrgId>
<Id>
<OrgId>
<Othr>
<Id>BVMF</Id>
<SchmeNm>
<Prtry>39</Prtry>
</SchmeNm>
<Issr>40</Issr>
</Othr>
</OrgId>
</Id>
</OrgId>
</Fr>
<To>
<OrgId>
<Id>
<OrgId>
<Othr>
<Id>PUBLIC</Id>
<SchmeNm>
<Prtry>39</Prtry>
</SchmeNm>
<Issr>40</Issr>
</Othr>
</OrgId>
</Id>
</OrgId>
</To>
</AppHdr>
<Document xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:bvmf.217.01.xsd">
<PricRpt>
<TradDt>
<Dt>2024-08-30</Dt>
</TradDt>
<SctyId>
<TckrSymb>CCMU24</TckrSymb>
</SctyId>
<FinInstrmId>
<OthrId>
<Id>200001038816</Id>
<Tp>
<Prtry>8</Prtry>
</Tp>
</OthrId>
<PlcOfListg>
<MktIdrCd>BVMF</MktIdrCd>
</PlcOfListg>
</FinInstrmId>
<TradDtls/>
<FinInstrmAttrbts>
<OpnIntrst>35614</OpnIntrst>
<FrstPric Ccy="BRL">60.99</FrstPric>
<MinPric Ccy="BRL">60.75</MinPric>
<MaxPric Ccy="BRL">61.7</MaxPric>
<TradAvrgPric Ccy="BRL">61.46</TradAvrgPric>
<LastPric Ccy="BRL">61.7</LastPric>
<RglrTxsQty>6697</RglrTxsQty>
<AdjstdQt Ccy="BRL">61.61</AdjstdQt>
<AdjstdQtStin>F</AdjstdQtStin>
<PrvsAdjstdQt Ccy="BRL">61.1</PrvsAdjstdQt>
<PrvsAdjstdQtStin>F</PrvsAdjstdQtStin>
</FinInstrmAttrbts>
</PricRpt>
</Document>
</BizGrp>
</Xchg>
</BizFileHdr>
</Document>
这是我的 R、RStudio 和软件包信息:
platform x86_64-w64-mingw32
version.string R version 4.4.1 (2024-06-14 ucrt)
rstudio 2024.09.0+375 "Cranberry Hibiscus" Release (c8fc7aee6dc218d5687553f9041c6b1e5ea268ff, 2024-09-16)
xml2 version xml2_1.3.6
dplyr version dplyr_1.1.4
您有 2 个选择。
要么将名称空间添加到所有子节点,例如:
xml_text(xml_find_first(report, ".//d3:TradDt/d3:Dt"))
或使用
xml_ns_strip()
函数并消除在 xml_find 函数中使用命名空间的需要。
xml_ns_strip(test)
reports <- xml_find_all(test, ".//PricRpt")
xml_text(xml_find_first(reports, ".//TradDt/Dt"))