我有一组XML文件正在阅读,并想知道处理以下内容的最佳方法:
<MyDecision>
<Decision>
<DecisionID>X1234</DecisionID>
<DecisionReasons xmlns:a="http://schemas.datacontract.org/2004/07/Contracts">
<a:Reason>
<a:Description>DOBMismatch</a:Description>
</a:Reason>
<a:Reason>
<a:Description>PrimaryChecksFail</a:Description>
</a:Reason>
<a:Reason>
<a:Description>IncomeReferral</a:Description>
</a:Reason>
</DecisionReasons>
</Decision>
</MyDecision>
目前,我正在运行一些R代码但得到响应:
Error: Duplicate identifiers for rows (2, 3, 4)
预期的输出是一个类似于以下内容的数据框:
fieldname |contents
MyDecision_Decision_DecisionID |X1234
MyDecision_Decision_DecisionReasons_Reason_Description_DOBMismatch |DOBMismatch
MyDecision_Decision_DecisionReasons_Reason_Description_PrimaryChecksFail |PrimaryChecksFail
MyDecision_Decision_DecisionReasons_Reason_Description_IncomeReferral |IncomeReferral
我目前的代码如下:
library(profvis)
library(XML)
library(xml2)
library(plyr)
library(tidyverse)
library(reshape2)
library(foreign)
library(rio)
setwd('c:/temp/xml/t')
df <- data.frame()
transposed.df1 <- data.frame()
allxmldata <- data.frame()
inputfiles <- as.character('test.xml')
findchildren<-function(nodes, df) {
numchild <- sapply(nodes, function(x){length(xml_children(x))})
xmlvalue <- xml_text(nodes[numchild==0])
xmlname <- xml_name(nodes[numchild==0])
xmlpath <- sapply(nodes[numchild==0], function(x) {gsub(', ','_', toString(rev(xml_name(xml_parents(x)))))})
if (isTRUE(xmlpath == 'MyDecision_Decision_DecisionReasons_Reason')) {
fieldname <- paste(xmlpath,xmlname,xmlvalue,sep = '_')
} else {
fieldname <- paste(xmlpath,xmlname,sep = '_')
}
contents <- sapply(xmlvalue, function(f){is.na(f)<-which(f == '');f})
dftemp <- data.frame(fieldname, contents)
df <- rbind(df, dftemp)
print(dim(df))
if (sum(numchild)>0){
findchildren(xml_children(nodes[numchild>0]), df) }
else{ return(df)}
}
for (x in inputfiles) {
df1 <- findchildren(xml_children(read_xml(x)),df)
xml.df1 <- data.frame(spread(df1, key = fieldname, value = contents), fix.empty.names = TRUE)
allxmldata <- rbind.fill(allxmldata,xml.df1)
}
我希望有人可以指出我做错了什么......
我提出了以下适用于您的示例数据集的解决方案。希望这种方法也可以用于更大的数据集。
# we need these two packages
library(xml2)
library(tidyverse)
# read in the xml-file
xml <- read_xml(
'<MyDecision>
<Decision>
<DecisionID>X1234</DecisionID>
<DecisionReasons xmlns:a="http://schemas.datacontract.org/2004/07/Contracts">
<a:Reason>
<a:Description>DOBMismatch</a:Description>
</a:Reason>
<a:Reason>
<a:Description>PrimaryChecksFail</a:Description>
</a:Reason>
<a:Reason>
<a:Description>IncomeReferral</a:Description>
</a:Reason>
</DecisionReasons>
</Decision>
</MyDecision>'
)
xml %>%
# first find all nodes that doesn't contain any child nodes
xml_find_all(".//node()[not(node())]") %>%
# find the parent of each node
map(xml_parent) %>%
# extract name and text of each of the childless nodes
map(~list(name = xml_name(.x), text = xml_text(.x))) %>%
# bind rows.
bind_rows()
这会产生以下输出:
# A tibble: 4 x 2
name text
<chr> <chr>
1 DecisionID X1234
2 Description DOBMismatch
3 Description PrimaryChecksFail
4 Description IncomeReferral