例如:通过 id(getElementById)定位容器元素(div#container)后,会得到一个 HTML 集合,其中包含每个元素及其全部属性,也包括在每个嵌套项中重复出现的子节点。随后我把每一项依次放入数组,结果 DOM 树的每一层里都留下了相同的数据:
0: <div class="container"><div><div><main><footer><div class="container-fluid"><p> © 2018-2020 Copyright: </p></div></footer></main></div></div></div>
1: <div><div><main><footer><div class="container-fluid"><p> © 2018-2020 Copyright: </p></div></footer></main></div></div>
2: <div><main><footer><div class="container-fluid"><p> © 2018-2020 Copyright: </p></div></footer></main></div>
3: <main><footer><div class="container-fluid"><p> © 2018-2020 Copyright: </p></div></footer></main>
4: <footer><div class="container-fluid"><p> © 2018-2020 Copyright: </p></div></footer>
5: <div class="container-fluid"><p> © 2018-2020 Copyright: </p></div>
6: <p> © 2018-2020 Copyright: </p>
我想做的是只获取一次实际内容(例如 <p> © 2018-2020 Copyright: </p>),并将其与对应的 XPath 位置关联起来,以便之后仅凭上面的结构(只保留元素标签和属性)重新组装 HTML 文档,而内容只插入到最末一级的子节点中,如下所示:
/DIV/DIV/DIV/MAIN/FOOTER/ --> `<div class="container-fluid"><p></p></div>`
/DIV/DIV/DIV/MAIN/FOOTER/DIV --> `<p></p>`
/DIV/DIV/DIV/MAIN/FOOTER/DIV/P --> © 2018-2020 Copyright:
背景/上下文:这样做的目的是减少数组对象中的冗余,从而构造一个精简的有效负载(最终字符串化为 JSON)发送给 Microsoft Translator API,这样我就不必重复翻译相同的内容节点;之后再使用 XPath 和 jQuery 将翻译后的文本响应注入回其原始 DOM 位置,从而重建翻译后的页面。
到目前为止,我已经借助 jQuery 和 TreeWalker Web API(https://developer.mozilla.org/en-US/docs/Web/API/TreeWalker)完成了下面这些工作……
JavaScript:
// Look up the element with id "container"; it is passed to elementNodesUnder() below as the root of the walk.
var content = document.getElementById('container');
// Shared array that elementNodesUnder() appends the collected elements to.
var b = [];
/**
 * Collects the elements under `el` that carry markup worth processing.
 *
 * An element is accepted when its `innerHTML` is non-empty and its tag name is
 * not one of SCRIPT, STYLE, svg, I or VIDEO (compared exactly as written).
 * Every other element gets FILTER_SKIP rather than FILTER_REJECT, so the
 * walker still descends into its children.
 *
 * Accepted elements are appended, in the order the walker yields them, to the
 * shared array `b` declared elsewhere in this file.
 *
 * @param {Element} el - Root of the subtree to walk.
 * @returns {Element[]} The shared array `b` (it also holds anything pushed before this call).
 */
function elementNodesUnder(el) {
  const excludedTags = ['SCRIPT', 'STYLE', 'svg', 'I', 'VIDEO'];

  // Declared locally: assigning these names without a declaration creates
  // implicit globals and throws a ReferenceError in strict mode / ES modules.
  const nodeFilter = function (node) {
    if (node.innerHTML && !excludedTags.includes(node.tagName)) {
      return NodeFilter.FILTER_ACCEPT;
    }
    return NodeFilter.FILTER_SKIP;
  };

  // The legacy fourth argument (expandEntityReferences) is obsolete and is no longer passed.
  const walk = document.createTreeWalker(el, NodeFilter.SHOW_ELEMENT, nodeFilter);

  let n;
  while ((n = walk.nextNode())) {
    b.push(n);
  }
  return b;
}
// Fill the shared array `b` with the elements found under the container, then dump it for inspection.
elementNodesUnder(content);
console.log(b);

// Parallel collections: xPathArray[k] is the XPath of the element whose markup is innerHTMLdinner[k].
var xPathArray = [];
var innerHTMLdinner = [];

// Record an XPath / innerHTML pair for every collected element that has markup.
for (const element of b) {
  if (!element.innerHTML) {
    continue;
  }
  xPathArray.push(getElementXPath(element));
  innerHTMLdinner.push(element.innerHTML);
}

// Zip the two collections into one record per element and serialise the result.
var xpathNodeMap = xPathArray.map(function (xPath, position) {
  return { xPathArray: xPath, innerHTML: innerHTMLdinner[position] };
});
var xpathNodeMapJSON = JSON.stringify(xpathNodeMap);
console.log(xpathNodeMapJSON);
/**
 * Builds an XPath-style location string for an element by climbing its
 * ancestor chain, e.g. "/HTML/BODY/DIV[6]/DIV".
 *
 * Each step is the element's `tagName`; a "[n]" suffix is added only when
 * getElementIdx() reports a position greater than 1.
 *
 * @param {Element} elt - Element to locate.
 * @returns {string} The path, or '' when `elt` is falsy or is not an element node.
 */
function getElementXPath(elt) {
  let path = '';
  // Stop as soon as the chain leaves element nodes (nodeType 1).
  for (let node = elt; node && node.nodeType === 1; node = node.parentNode) {
    // Loop-local bindings: assigning these without a declaration creates
    // implicit globals and throws a ReferenceError in strict mode / ES modules.
    const idx = getElementIdx(node);
    let xname = node.tagName;
    if (idx > 1) {
      xname += `[${idx}]`;
    }
    path = `/${xname}${path}`;
  }
  return path;
}
/**
 * Returns the 1-based position of `elt` among the element siblings before it
 * that share its tag name.
 *
 * Only preceding siblings are examined; non-element nodes (nodeType other
 * than 1) and elements with a different tag name do not count.
 *
 * @param {Element} elt - Element whose position is wanted.
 * @returns {number} 1 when no earlier sibling has the same tag name, otherwise 1 plus the number of such siblings.
 */
function getElementIdx(elt) {
  let position = 1;
  let cursor = elt.previousSibling;
  while (cursor) {
    const isSameTagElement = cursor.nodeType === 1 && cursor.tagName === elt.tagName;
    if (isSameTagElement) {
      position += 1;
    }
    cursor = cursor.previousSibling;
  }
  return position;
}
HTML示例:
<html>
<body>
<div></div>
<div></div>
<div></div>
<div></div>
<div></div>
<div id="container">
<div class="layout">
<div class="bodyContainer">
<main class="wrapper">
<footer class="full-standard">
<div class="container no-print">
<div class="row">
<img alt="" src="" />
</div> <!-- footer > div.row -->
</div> <!-- /div.container.no-print -->
<div class="footer-copyright">
<div class="container-fluid">
<p>© 2020 Copyright</p>
</div> <!-- /div.container-fluid -->
</div> <!-- /div.footer-copyright -->
</footer> <!-- /footer.full-standard -->
</main> <!-- /main.wrapper -->
</div> <!-- /div.bodyContainer-->
</div> <!--/div.layout -->
</div> <!-- / div#container -->
</body>
</html>
XPath结果示例:
{
"xPathArray": "/HTML/BODY/DIV[6]/DIV/DIV/MAIN/FOOTER/DIV[2]",
"innerHTML": "<div class=\"container-fluid\"><p> © 2018-2020 Copyright: </p></div>"
},
{
"xPathArray": "/HTML/BODY/DIV[6]/DIV/DIV/MAIN/FOOTER/DIV[2]/DIV",
"innerHTML": "<p> © 2018-2020 Copyright: </p>"
},
{
"xPathArray": "/HTML/BODY/DIV[6]/DIV/DIV/MAIN/FOOTER/DIV[2]/DIV/P",
"innerHTML": " © 2018-2020 Copyright: "
}
令人惊讶的是,我还没有找到与这个问题足够接近的内容;如果是我遗漏了,先说声抱歉。任何能为我指明正确方向的帮助,我都将不胜感激。谢谢!
尝试给你想要的元素分配一个唯一 ID,然后通过该 ID 获取这个元素,并把它的 innerText 传给你的处理程序?
<p id='unique_id'> Some Text </p>
// getElementById returns a single element (or null), not a collection, so innerHTML is read directly without a [0] index.
document.getElementById('unique_id').innerHTML;
你可能需要自己再摸索调整一下,但总体思路应该可行。