我在数据库中存储了 HTML 字符串，我只想提取文本内容，去掉所有 HTML 标签。在 PHP 中执行此操作的最佳方法是什么？
我想提取纯文本。我怎样才能实现这个目标？
您可以使用 PHP 的 DOMDocument 类来解析 HTML 并提取文本内容。方法如下:
<?php
require_once 'includes/connection.inc.php';
/**
 * Detach every element with the given tag name from the document.
 *
 * The list returned by getElementsByTagName() is live, so the matches are
 * visited from the last one to the first: removing a node then never shifts
 * the position of a node that still has to be visited.
 *
 * @param string      $tagName  Tag name to remove (e.g. 'script').
 * @param DOMDocument $document Document to modify in place.
 */
function removeElementsByTagName($tagName, $document)
{
    $matches = $document->getElementsByTagName($tagName);
    $position = $matches->length - 1;

    while ($position >= 0) {
        $element = $matches->item($position);
        $element->parentNode->removeChild($element);
        $position--;
    }
}
// Read every row of the emails table and print the plain-text version of its
// "content" column, separating the rows with <hr>.
$fetch_data = "SELECT * FROM emails";
$fetch_data_conn = mysqli_query($connection, $fetch_data);

if ($fetch_data_conn === false) {
    // mysqli_query() returns false on failure when mysqli is not configured to
    // throw; report that instead of handing false to mysqli_fetch_assoc().
    trigger_error('Query failed: ' . mysqli_error($connection), E_USER_WARNING);
} else {
    // Keep libxml parse errors (stored HTML is rarely well-formed) out of the
    // output. The setting is loop-invariant, so it is applied once.
    libxml_use_internal_errors(true);

    while ($row = mysqli_fetch_assoc($fetch_data_conn)) {
        $content = (string) $row["content"];

        if ($content === '') {
            // DOMDocument::loadHTML() rejects an empty string (ValueError on
            // PHP 8), and there is nothing to extract anyway.
            $textContent = '';
        } else {
            $doc = new DOMDocument();

            // loadHTML() does not treat un-annotated input as UTF-8, which
            // mangles multi-byte text. Turning every non-ASCII character into
            // a numeric entity first makes the parse charset-independent.
            // NOTE(review): assumes the stored content is UTF-8 - confirm.
            $doc->loadHTML(
                mb_encode_numericentity($content, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8')
            );
            libxml_clear_errors();

            // The text inside these elements is code or styling, not content,
            // and strip_tags() would otherwise leave it in the result.
            removeElementsByTagName('script', $doc);
            removeElementsByTagName('style', $doc);
            removeElementsByTagName('link', $doc);

            // Serialise what is left and drop the remaining markup.
            $textContent = strip_tags($doc->saveHTML());
        }

        echo $textContent;
        echo "<hr>";
    }

    mysqli_free_result($fetch_data_conn);
}
?>
此代码先使用 DOMDocument 解析 HTML 并移除 script、style 和 link 元素，再使用 strip_tags 删除剩余的标签，只留下纯文本内容。
<?php
require_once 'includes/connection.inc.php';
/**
 * Helper that removes all elements with a specific tag name from a document.
 *
 * getElementsByTagName() returns a live list, so the matches are walked from
 * the end towards the start: a removal then never changes the index of a node
 * that has not been handled yet.
 *
 * @param string      $tagName  Tag name to remove (e.g. 'style').
 * @param DOMDocument $document Document to modify in place.
 */
function removeElementsByTagName($tagName, $document) {
    $matches = $document->getElementsByTagName($tagName);
    $position = $matches->length - 1;

    while ($position >= 0) {
        $element = $matches->item($position);
        $element->parentNode->removeChild($element);
        $position--;
    }
}
// Fetch emails from the database and print the plain-text version of each
// row's "content" column, separating the rows with <hr>.
$fetch_data = "SELECT * FROM emails";
$fetch_data_conn = mysqli_query($connection, $fetch_data);

if ($fetch_data_conn === false) {
    // mysqli_query() returns false on failure when mysqli is not configured to
    // throw; report that instead of handing false to mysqli_fetch_assoc().
    trigger_error('Query failed: ' . mysqli_error($connection), E_USER_WARNING);
} else {
    // Suppress HTML parsing errors (stored HTML is rarely well-formed). The
    // setting is loop-invariant, so it is applied once.
    libxml_use_internal_errors(true);

    while ($row = mysqli_fetch_assoc($fetch_data_conn)) {
        $content = (string) $row["content"];

        if ($content === '') {
            // DOMDocument::loadHTML() rejects an empty string (ValueError on
            // PHP 8), and there is nothing to extract anyway.
            $textContent = '';
        } else {
            // Create a new DOMDocument object
            $doc = new DOMDocument();

            // Load the HTML content. loadHTML() does not treat un-annotated
            // input as UTF-8, which mangles multi-byte text, so every
            // non-ASCII character is turned into a numeric entity first.
            // NOTE(review): assumes the stored content is UTF-8 - confirm.
            $doc->loadHTML(
                mb_encode_numericentity($content, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8')
            );

            // Clear error messages collected during parsing
            libxml_clear_errors();

            // Remove unwanted tags: their text is code or styling, and
            // strip_tags() would otherwise leave it in the result.
            removeElementsByTagName('script', $doc);
            removeElementsByTagName('style', $doc);
            removeElementsByTagName('link', $doc);

            // Remove remaining HTML tags and get plain text
            $textContent = strip_tags($doc->saveHTML());
        }

        // Print the result
        echo $textContent;
        echo "<hr>";
    }

    mysqli_free_result($fetch_data_conn);
}
?>