目标:对页面源码计算哈希值,以检测两次定时抓取之间页面内容是否发生了变化。
Python代码:
import requests
import hashlib

url = 'http://example.org/'
# Certificate verification is left at the requests default (the original
# passed verify=False, which disables it and is irrelevant for an http:// URL).
# The explicit timeout keeps a scheduled run from blocking forever on a
# stalled server; 30 s mirrors the timeout used by the Puppeteer version.
r = requests.get(url, timeout=30)
# Pin the decoding to UTF-8 so the text does not depend on the charset
# requests would otherwise pick for this response.
r.encoding = 'utf-8'
# Hash the UTF-8 bytes of the decoded body and print the hex digest.
print(hashlib.sha256(r.text.encode('utf-8')).hexdigest())
结果:ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
Node.js 和 Puppeteer 代码:
// Loads http://example.org/ in a Puppeteer-launched browser and prints a
// SHA-256 hex digest intended to fingerprint the page source.
const puppeteer = require('puppeteer');
var crypto = require('crypto');
(async()=> {
const browser= await puppeteer.launch();
const page= await browser.newPage();
try {
// Navigate and keep the returned response object; the options ask goto() to
// wait for 'domcontentloaded' with a timeout value of 30000.
const response = await page.goto('http://example.org/', { waitUntil: 'domcontentloaded', timeout: 30000 });
// NOTE(review): response.text() is not awaited here. It returns a Promise, so
// .toString() stringifies the Promise object itself and the digest printed
// below is of that string, not of the page body.
console.log(crypto.createHash('sha256').update(response.text().toString()).digest('hex'));
} catch (e) {
// Any error thrown inside the try block is reported by its message only.
console.log(e.message);
}
// Runs after the try/catch; launch() and newPage() are outside the try, so a
// failure there would skip this line.
await browser.close();
})();
结果:b4e6060006b920bc021110ea8ab8d67744983e2b7ff75e1c8be5613af93f687d
问题:
为什么会有区别?据我检查,两种方法返回相同的响应。
我能得到相同的结果吗?
是否有更好的方法来检测页面内容的变化?
您需要对 Puppeteer 中的 response.text() 使用 await(它返回的是一个 Promise),否则您哈希的将是 Promise { <pending> } 的字符串化版本,而不是页面内容:
const puppeteer = require('puppeteer');
const crypto = require('crypto');

/**
 * Loads http://example.org/ with Puppeteer and prints the SHA-256 hex digest
 * of the response body to stdout.
 *
 * On failure the error message goes to stderr and the process exit code is
 * set to 1, so stdout only ever carries a digest and a scheduled job can tell
 * a failed fetch apart from a changed page.
 */
(async () => {
  let browser;
  try {
    // launch() and newPage() are inside the try so that a failure in either
    // is reported and the browser (if it started) is still closed.
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    const response = await page.goto('http://example.org/', {
      waitUntil: 'domcontentloaded',
      timeout: 30000,
    });
    // Per Puppeteer's documentation goto() may resolve to null for some
    // navigations; fail with a clear message instead of a TypeError.
    if (!response) {
      throw new Error('Navigation did not produce a response');
    }
    // response.text() returns a Promise; it must be awaited so that the page
    // body, not the stringified Promise, is what gets hashed.
    const source = await response.text();
    console.log(crypto.createHash('sha256').update(source).digest('hex'));
  } catch (e) {
    console.error(e.message);
    process.exitCode = 1;
  } finally {
    // Always release the browser process, whether or not hashing succeeded.
    if (browser) {
      await browser.close();
    }
  }
})();
输出:
python c.py
ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
node c.js
ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9