如何启用/实现对任何 DOM 节点或整个文档的文本内容的全文搜索?

问题描述 投票:0回答:1

我在一次技术测试中遇到了这样一道题:需要统计某个单词(或短语)在 HTML 正文中出现的次数,并且必须忽略 script 标签和注释中的内容——这正是这道题的特殊之处。我的实现必须通过如下所示的 chai 测试文件。

HTML 文件:

<body>
  <div id="text">
    Hello <strong>world</strong>
    <p class="world">This is the p 1</p>
    <p class="rabbit">This is the p 2</p>
    <p>This is the p 3</p>
    <p>Hello world !</p>
    <!-- <p>Not displayed hello world</p> -->
    <p>It's 9 o'clock, I will send you an e-mail.</p>
    <p>Is this the 'main'?</p>
    <p>This is a multiline paragraph</p>
    <pre>
      <div>This is some math in HTML</div>
      const n = 2;
      if (1 < n && n > 4) console.log(n);
    </pre>
    <ul>
      <li>This is right</li>
      <li>This is a copyright</li>
    </ul>
    <script>
      console.log('Hello world');
    </script>
  </div>

  <!-- Include the function file -->
  <script src="script.js"></script>
  <!-- Include the test file -->
  <script src="/test/test.js"></script>

  <div id="mocha"></div>
  <script>
    mocha.setup('bdd');
  </script>
  <script>
    mocha.run();
  </script>
</body>

Chai.js 测试文件:

// Specification for `countOccurence(phrase)`: each assertion states how many
// times `phrase` is expected to be counted in the text of the HTML fixture.
// Per the task description, content inside <script> elements and HTML comments
// must not contribute to any count.
describe('countOccurence', function() {
  // The same phrase in three different casings is expected to yield the same count.
  it('count hello world', function() {
    assert.equal(countOccurence('Hello world'), 1);
    assert.equal(countOccurence('hello world'), 1);
    assert.equal(countOccurence('Hello World'), 1);
    assert.equal(countOccurence('Hello'), 2);
    assert.equal(countOccurence('world'), 2);
  });

  // Single-letter query.
  it('count p', function() {
    assert.equal(countOccurence('p'), 3);
  });

  // NOTE(review): presumably 'right' must not be counted inside "copyright" — confirm.
  it('count right', function() {
    assert.equal(countOccurence('right'), 1);
  });

  // Single-letter query.
  it('count n', function() {
    assert.equal(countOccurence('n'), 4);
  });

  // Both the bare word and the dotted expression are expected once.
  it('count log', function() {
    assert.equal(countOccurence('log'), 1);
    assert.equal(countOccurence('console.log'), 1);
  });

  // "o'clock" is expected once, while the bare 'clock' is expected zero times.
  it('count clock', function() {
    assert.equal(countOccurence("o'clock"), 1);
    assert.equal(countOccurence('clock'), 0);
  });

  // Hyphenated query.
  it('count e-mail', function() {
    assert.equal(countOccurence('e-mail'), 1);
  });

  // Multi-word queries.
  // NOTE(review): the second phrase presumably yields 0 because it would span
  // two separate <p> elements of the fixture — confirm.
  it('count sentence', function() {
    assert.equal(countOccurence('This is a multiline paragraph'), 1);
    assert.equal(countOccurence("Hello world ! It's 9 o'clock"), 0);
  });
});

为了解决这个问题,我创建了一个函数来提取 HTML 文件的内容并删除脚本和注释。

/**
 * Reduces the markup of `document.body` to text using regex replacements only.
 *
 * Three passes run over `document.body.innerHTML`, in this order:
 * `<script>…</script>` elements are removed, HTML comments are removed, and
 * every remaining tag is replaced by a single space. Nothing else is
 * transformed, so whatever else the serialized markup contains is returned
 * as it appears.
 *
 * @returns {string} The body markup with scripts, comments and tags stripped.
 */
function extractHTMLContent() {
  const scriptElementPattern = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
  const commentPattern = /<!--[\s\S]*?-->/g;
  const tagPattern = /<\/?[^>]+(>|$)/g;

  return document.body.innerHTML
    .replace(scriptElementPattern, '')
    .replace(commentPattern, '')
    .replace(tagPattern, ' ');
}

然后我使用这个功能:

/**
 * Counts how often `phrase` occurs in the text returned by `extractHTMLContent()`.
 *
 * The extracted text is lower-cased, regex metacharacters in `phrase` are
 * escaped, and the phrase is wrapped in `\b` word boundaries before matching
 * with the `g` and `i` flags. The regex and the raw match result are logged
 * to the console.
 *
 * @param {string} phrase - The word or phrase to look for.
 * @returns {number} The number of matches, `0` when there are none.
 * @throws {TypeError} When `phrase` is not a string.
 */
function countOccurence(phrase) {
  if (typeof phrase !== 'string') {
    throw new TypeError('Phrase needs to be a string/');
  }

  const haystack = extractHTMLContent().toLowerCase();

  // Escape every character that has a special meaning inside a regex.
  const literalPhrase = phrase.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const wordBoundedPattern = new RegExp(`\\b${literalPhrase}\\b`, 'gi');

  console.log("Regex: ", wordBoundedPattern);

  const found = haystack.match(wordBoundedPattern);
  console.log(found);

  if (found === null) {
    return 0;
  }
  return found.length;
}

在这个函数中,我先检查传给 countOccurence 的参数是否为字符串,然后把文本统一转换为小写,转义特殊字符,构造一个只匹配所传入字符串的正则表达式,最后在 HTML 文件的文本中查找所有匹配项。

我无法通过 “count clock” 和 “count sentence” 这两个测试。我觉得撇号是 sentence 测试出问题的部分原因;另外,我的正则表达式没有按我的预期工作:它仍然会把 "o'clock" 中的 "clock" 计算在内,尽管这并不是我想要的。

如果各位有任何建议,我将非常感谢。

javascript regex dom full-text-search textnode
1个回答
0
投票

做法主要分为两步:

  1. 将所有文本内容提取到统一的可搜索字符串中,同时保留正确的文本流(顺序)。
  2. 根据所提供的搜索/查询字符串创建一个正则表达式;为此,该字符串 a) 需要按照与之前提取文本内容时相同的规则进行统一(规范化空白),并且 b) 还需要转义其中出现的所有正则表达式特殊字符。

至于 1),为了保持某个元素节点内所有文本的正确流动顺序,必须采用基于递归的方法,按照文本节点在文档中的自然位置收集所有非空文本节点。随后在一次映射(map)操作中,把每个文本节点的文本值里的连续空白折叠成单个空格,并去除首尾空白。最后将这些统一后的字符串连接成一个可搜索的字符串。

至于 2),OP 的 countOccurrence 函数需要重构为这样一个函数:它的第一个参数是可搜索的字符串,第二个参数是正则表达式或基于字符串的搜索/查询。其余必要的子任务已在上面的 a) 和 b) 中提到。

// element- and text-node specific detection-helpers.

/**
 * Tells whether `node` is an element node other than a `<script>` element.
 *
 * @param {Node} node - The DOM node to inspect.
 * @returns {boolean} `true` when `nodeType` is 1 (element) and the
 *   lower-cased `tagName` is not `'script'`; `false` otherwise.
 */
function isNonScriptElementNode(node) {
  const ELEMENT_NODE_TYPE = 1;

  if (node.nodeType !== ELEMENT_NODE_TYPE) {
    return false;
  }
  return node.tagName.toLowerCase() !== 'script';
}
/**
 * Tells whether `node` is a text node that carries visible (non-whitespace)
 * text and is not a direct child of a `<script>` element.
 *
 * A parent without a `tagName`, or no parent at all (a detached text node),
 * is treated as "not a script parent" instead of raising a TypeError.
 *
 * @param {Node} node - The DOM node to inspect.
 * @returns {boolean} `true` when `nodeType` is 3 (text), the parent is not a
 *   `<script>` element and the trimmed `nodeValue` is not empty.
 */
function isNonEmptyTextNode(node) {
  return (
       (node.nodeType === 3)
    // Optional chaining guards against a missing parent or a parent without a tag name.
    && (node.parentNode?.tagName?.toLowerCase() !== 'script')
    && (node.nodeValue.trim() !== '')
  );
}

// recursive text-node specific reducer-functionality.
/**
 * Recursively collects, in document order, every non-empty text node found
 * at or below `node`.
 *
 * Non-script element nodes are descended into; a node accepted by
 * `isNonEmptyTextNode` is returned as a single-item list; every other node
 * contributes nothing.
 *
 * @param {Node} node - The node to start collecting from.
 * @returns {Node[]} The collected text nodes in their natural order.
 */
function collectNonEmptyTextNodeList(node) {
  if (isNonScriptElementNode(node)) {
    // `flatMap` keeps the child order, avoids spreading a possibly huge list
    // into `push` arguments and does not re-copy the accumulator per child.
    return Array
      .from(node.childNodes)
      .flatMap(childNode => collectNonEmptyTextNodeList(childNode));
  }
  return isNonEmptyTextNode(node) ? [node] : [];
}

// the OP's newly implemented occurrence-count function.
/**
 * Counts how often a search occurs within `text`.
 *
 * - A `RegExp` is used as provided; when it lacks the `g` flag, a global copy
 *   is matched instead, so that every occurrence is counted (a non-global
 *   `match` would return the first match plus its capture groups).
 * - Any other value is converted to a string, its whitespace sequences are
 *   collapsed to single spaces, it is trimmed, and regex metacharacters are
 *   escaped. The match is case-sensitive. A `\b` word boundary is required
 *   only at an end of the phrase that is a word character, because `\b` next
 *   to e.g. a trailing `!` would demand a following word character.
 *
 * @param {string} text - The searchable text.
 * @param {string|RegExp} stringOrRegExp - The phrase or pattern to count.
 * @returns {number} The number of occurrences; `0` for an empty phrase.
 */
function countOccurrence(text, stringOrRegExp) {
  let regXSearch;

  if (stringOrRegExp instanceof RegExp) {
    regXSearch = stringOrRegExp.global
      ? stringOrRegExp
      : new RegExp(stringOrRegExp.source, `${ stringOrRegExp.flags }g`);
  } else {
    const search = String(stringOrRegExp).replace(/\s+/g, ' ').trim();

    // An empty pattern would match at every position of `text`.
    if (search === '') {
      return 0;
    }
    const escapedSearch = search.replace(/[-[\]{}()*+?.,\\^$|#]/g, '\\$&');
    const leadingBoundary = /^\w/.test(search) ? '\\b' : '';
    const trailingBoundary = /\w$/.test(search) ? '\\b' : '';

    regXSearch = new RegExp(
      `${ leadingBoundary }${ escapedSearch }${ trailingBoundary }`, 'g'
    );
  }
  return (text.match(regXSearch) ?? []).length;
}


// Demo: build one searchable string from the document's text nodes and log
// the occurrence count of several queries. The trailing comments state the
// count expected for the markup that accompanies this snippet.
const textNodeList = collectNonEmptyTextNodeList(document.body);

// Collapse whitespace within every text node, then join the nodes' texts
// with single spaces so the overall text order is preserved.
const textContent = textNodeList
  .map(node => node.textContent.replace(/\s+/g, ' ').trim())
  .join(' ');


console.log({ textContent });

console.log(
  "'hello world' count ...", countOccurrence(textContent, 'hello world'), // 0
);
console.log(
  "'Hello world' count ...", countOccurrence(textContent, 'Hello world'), // 1
);
console.log(
  "'Hello World' count ...", countOccurrence(textContent, 'Hello World'), // 1
);
console.log(
  "\/hello world\/ig count ...", countOccurrence(textContent, /hello world/ig), // 2
);
console.log('\n');

console.log(
  "'Hello' count ...", countOccurrence(textContent, 'Hello'), // 2
);
console.log(
  "'world' count ...", countOccurrence(textContent, 'world'), // 1
);
console.log(
  "'World' count ...", countOccurrence(textContent, 'World'), // 1
);
console.log(
  "\/world\/ig count ...", countOccurrence(textContent, /world/ig), // 2
);
console.log('\n');

console.log(
  '"o\'clock" count ...', countOccurrence(textContent, "o'clock"), // 1
);
console.log(
  "'This is a multiline paragraph' count ...", countOccurrence(textContent, 'This is a multiline paragraph'), // 1
);
// A multi-line query; its whitespace gets normalized before matching.
const search = `Hello World !

It's 9 o'clock`;

console.log(
  `"${ search }" count ...`, countOccurrence(textContent, search), // 1
);
.as-console-wrapper { left: auto!important; width: 50%; min-height: 100%; }
<div id="text">
  Hello <strong>world</strong>
  <p class="world">This is the p 1</p>
  <p class="rabbit">This is the p 2</p>
  <p>This is the p 3</p>
  <p>Hello World !</p>
  <!-- <p>Not displayed hello world</p> -->
  <p>It's 9 o'clock, I will send you an e-mail.</p>
  <p>Is this the 'main'?</p>
  <p>This is a multiline paragraph</p>
  <pre>
    <div>This is some math in HTML</div>
    const n = 2;
    if (1 &lt; n && n &gt; 4) console.log(n);
  </pre>
  <ul>
    <li>This is right</li>
    <li>This is a copyright</li>
  </ul>
  <script>
    console.log('Hello world');
  </script>
</div>

© www.soinside.com 2019 - 2024. All rights reserved.