早安,
我需要从分属不同网站的大约 2000 个 URL 中提取主要产品的价格,但这些页面并没有可供依赖的通用 HTML 模式。从我分享的截图中可以看到,这些价格都没有使用典型的价格标记(例如带有 price 类名的标签),这种情况占了将近 60% 的 URL,因此我只能把所有抓取到的内容当作非结构化的 HTML 源来处理。
下面分享的代码采用了一种很基础的做法:先经过过滤提取整个 body 中的文本,最后再用一个正则表达式匹配价格。它只有约 5% 的准确率,对我的工作帮助不大。
我想知道,借助机器学习,即训练模型并结合自然语言处理 (NLP) 和机器视觉等技术,从新的 HTML 页面中提取价格,是否是我唯一的选择。这些对我来说都是全新的领域,实现起来很困难。
/**
 * Escapes every regular-expression metacharacter in `text` so the result can
 * be embedded in a pattern and matched literally.
 *
 * @param {string} text - Raw text to embed in a regular expression.
 * @returns {string} The text with regex metacharacters backslash-escaped.
 */
const escapeRegExpLiteral = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

/**
 * Finds the first "$"-prefixed amount that follows the product title in the
 * flattened page text.
 *
 * @param {string} pageText - Lower-cased page text with whitespace collapsed
 *   to single spaces.
 * @param {string} title - Product title to anchor the search on.
 * @returns {string|undefined} The numeric part of the price (without "$"),
 *   or `undefined` when no price follows the title.
 */
const findPriceAfterTitle = (pageText, title) => {
  // The page text is lower-cased and whitespace-collapsed, so the title must
  // be normalized the same way or it can never match. Each word is escaped so
  // characters such as "(", "+", or "." in a title are matched literally.
  const titlePattern = title
    .trim()
    .toLowerCase()
    .split(/\s+/)
    .map((word) => escapeRegExpLiteral(word))
    .join('\\s');

  // Amount grammar: 1-3 leading digits, optional thousands groups, optional
  // two-digit cents. The trailing (?!\d) stops the match from ending in the
  // middle of a digit run (e.g. capturing "1,29" out of "$1,299").
  const pricePattern = new RegExp(
    `\\b${titlePattern}\\s.*?\\$(\\d{1,3}(?:[.,]\\d{3})*(?:[ ,.]\\d{2})?)(?!\\d)`
  );

  const match = pricePattern.exec(pageText);
  return match === null ? undefined : match[1];
};

/**
 * Downloads a product page and extracts the first "$" price that appears
 * after the product title in the page's visible text.
 *
 * @param {string} url - Address of the product page to fetch.
 * @param {string} title - Product title expected to precede the price.
 * @returns {Promise<string|undefined>} Resolves with the numeric part of the
 *   price, or `undefined` when no price is found after the title. Rejects
 *   with the original error when the request or the parsing fails.
 */
const scraphtml = async (url, title) => {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);

    // Drop elements whose text is not visible product content.
    $('script, noscript, style, form').remove();

    // Flatten the body text: lower-case it and collapse every whitespace run
    // (including line breaks) into a single space.
    const pageText = $('body').text().toLowerCase().replace(/\s+/g, ' ').trim();

    return findPriceAfterTitle(pageText, title);
  } catch (error) {
    console.error(error);
    throw error;
  }
};