我偶然发现了一项技术测试,其中我必须计算 HTML 正文中单词的出现次数。我必须避免脚本标签和注释,这些是有问题的独特条件。 我必须成功通过看起来像这样的 chai 文件的测试。
HTML 文件:
<body>
<div id="text">
Hello <strong>world</strong>
<p class="world">This is the p 1</p>
<p class="rabbit">This is the p 2</p>
<p>This is the p 3</p>
<p>Hello world !</p>
<!-- <p>Not displayed hello world</p> -->
<p>It's 9 o'clock, I will send you an e-mail.</p>
<p>Is this the 'main'?</p>
<p>This is a multiline paragraph</p>
<pre>
<div>This is some math in HTML</div>
const n = 2;
if (1 < n && n > 4) console.log(n);
</pre>
<ul>
<li>This is right</li>
<li>This is a copyright</li>
</ul>
<script>
console.log('Hello world');
</script>
</div>
<!-- Include the function file -->
<script src="script.js"></script>
<!-- Include the test file -->
<script src="/test/test.js"></script>
<div id="mocha"></div>
<script>
mocha.setup('bdd');
</script>
<script>
mocha.run();
</script>
</body>
Chai.js 测试文件:
describe('countOccurence', function() {
it('count hello world', function() {
assert.equal(countOccurence('Hello world'), 1);
assert.equal(countOccurence('hello world'), 1);
assert.equal(countOccurence('Hello World'), 1);
assert.equal(countOccurence('Hello'), 2);
assert.equal(countOccurence('world'), 2);
});
it('count p', function() {
assert.equal(countOccurence('p'), 3);
});
it('count right', function() {
assert.equal(countOccurence('right'), 1);
});
it('count n', function() {
assert.equal(countOccurence('n'), 4);
});
it('count log', function() {
assert.equal(countOccurence('log'), 1);
assert.equal(countOccurence('console.log'), 1);
});
it('count clock', function() {
assert.equal(countOccurence("o'clock"), 1);
assert.equal(countOccurence('clock'), 0);
});
it('count e-mail', function() {
assert.equal(countOccurence('e-mail'), 1);
});
it('count sentence', function() {
assert.equal(countOccurence('This is a multiline paragraph'), 1);
assert.equal(countOccurence("Hello world ! It's 9 o'clock"), 0);
});
});
为了解决这个问题,我创建了一个函数来提取 HTML 文件的内容并删除脚本和注释。
function extractHTMLContent() {
const bodyContent = document.body.innerHTML;
const noScriptContent = bodyContent.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
const noCommentContent = noScriptContent.replace(/<!--[\s\S]*?-->/g, '');
const textContent = noCommentContent.replace(/<\/?[^>]+(>|$)/g, ' ');
return textContent;
}
然后我使用这个功能:
function countOccurence(phrase) {
if (typeof phrase !== 'string') {
throw new TypeError('Phrase needs to be a string/');
}
const bodyContent = extractHTMLContent();
const normalizedText = bodyContent.toLowerCase();
const escapedPhrase = phrase.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const regex = new RegExp(`\\b${escapedPhrase}\\b`, 'gi');
console.log("Regex: ", regex);
const matches = normalizedText.match(regex);
console.log(matches);
return matches ? matches.length : 0;
}
之后,我尝试检查传递给 countOccurrence 函数的 prop 是否是一个字符串,将文本标准化为小写,转义特殊字符,构造一个正则表达式以仅匹配函数传递的字符串,然后尝试检查HTML 文件中的所有匹配项。
我无法通过“计数时钟”和“计数句子”测试,我觉得撇号是句子测试中问题的一部分,并且我的正则表达式没有按我的预期工作,并且仍在计时帐户中的字符串,即使它不是故意的。
如果你们中的一些人有一些建议,我会很高兴听到他们
做法主要是两折
至于1),为了保持某个元素节点内任何文本的正确流动顺序,必须实现一种基于递归的方法,根据其在文档中的自然位置收集所有非空文本节点。在映射任务中,对于每个文本节点,会将其任何文本值的空白序列折叠成单个空白并修剪它。最终将统一字符串数组连接成单个可搜索字符串。
至于 2),OP 的
countOccurrence
函数需要重构为一个函数,该函数期望可搜索字符串值作为其第一个参数,正则表达式或基于字符串的搜索/查询作为其第二个参数。 2a) 和 2b) 已经提到了其他必要的子任务。
// element- and text-node spcific detection-helpers.
function isNonScriptElementNode(node) {
return (
node.nodeType === 1 &&
node.tagName.toLowerCase() !== 'script'
);
}
function isNonEmptyTextNode(node) {
return (
(node.nodeType === 3)
&& (node.parentNode.tagName.toLowerCase() !== 'script')
&& (node.nodeValue.trim() !== '')
);
}
// recursive text-node specific reducer-functionality.
function collectNonEmptyTextNodeList(node) {
const result = [];
if (isNonScriptElementNode(node)) {
result.push(
...[...node.childNodes].reduce((list, childNode) =>
list.concat(collectNonEmptyTextNodeList(childNode)), []
)
);
} else if (isNonEmptyTextNode(node)) {
result.push(node)
}
return result;
}
// the OP's newly implemented occurence-count function.
function countOccurrence(text, stringOrRegExp) {
const escapeSearch = value =>
value.replace(/\s+/g, ' ').trim().replace(/[-[\]{}()*+?.,\\^$|#]/g, '\\$&');
debugger;
const regXSearch = stringOrRegExp?.test
&& stringOrRegExp
|| RegExp(`\\b${ escapeSearch(String(stringOrRegExp)) }\\b`, 'g');
return (text.match(regXSearch) ?? []).length;
}
const textNodeList = collectNonEmptyTextNodeList(document.body);
const textContent = textNodeList
.map(node =>node.textContent.replace(/\s+/g, ' ').trim())
.join(' ');
console.log({ textContent });
console.log(
"hello world' count ...", countOccurrence(textContent, 'hello world'), // 0
);
console.log(
"'Hello world' count ...", countOccurrence(textContent, 'Hello world'), // 1
);
console.log(
"'Hello World' count ...", countOccurrence(textContent, 'Hello World'), // 1
);
console.log(
"\/hello world\/ig' count ...", countOccurrence(textContent, /hello world/ig), // 2
);
console.log('\n');
console.log(
"'Hello' count ...", countOccurrence(textContent, 'Hello'), // 2
);
console.log(
"'world' count ...", countOccurrence(textContent, 'world'), // 1
);
console.log(
"'World' count ...", countOccurrence(textContent, 'World'), // 1
);
console.log(
"\/world\/ig count ...", countOccurrence(textContent, /world/ig), // 2
);
console.log('\n');
console.log(
'"o\'clock" count ...', countOccurrence(textContent, "o'clock"), // 1
);
console.log(
"'This is a multiline paragraph' count ...", countOccurrence(textContent, 'This is a multiline paragraph'), // 1
);
const search = `Hello World !
It's 9 o'clock`;
console.log(
`"${ search }" count ...`, countOccurrence(textContent, search), // 1
);
.as-console-wrapper { left: auto!important; width: 50%; min-height: 100%; }
<div id="text">
Hello <strong>world</strong>
<p class="world">This is the p 1</p>
<p class="rabbit">This is the p 2</p>
<p>This is the p 3</p>
<p>Hello World !</p>
<!-- <p>Not displayed hello world</p> -->
<p>It's 9 o'clock, I will send you an e-mail.</p>
<p>Is this the 'main'?</p>
<p>This is a multiline paragraph</p>
<pre>
<div>This is some math in HTML</div>
const n = 2;
if (1 < n && n > 4) console.log(n);
</pre>
<ul>
<li>This is right</li>
<li>This is a copyright</li>
</ul>
<script>
console.log('Hello world');
</script>
</div>