Najprostsze podejście: pobierz tekst, policz jego hash i porównaj z poprzednio zapisanym hashem. Sprawdza się świetnie, gdy pytanie brzmi „czy treść strony się zmieniła?":
import { chromium } from 'playwright';
import crypto from 'node:crypto';
/**
 * Load a page in headless Chromium and fingerprint the text of one element.
 *
 * @param {string} url - Address of the page to load.
 * @param {string} selector - Selector of the element whose inner text is hashed.
 * @returns {Promise<{hash: string, text: string}>} The whitespace-normalized
 *   text and its SHA-256 digest as a hex string.
 * @throws Propagates any error raised while launching the browser, navigating,
 *   or reading the element; the browser is closed before the error surfaces.
 */
async function getContentHash(url, selector) {
  const browser = await chromium.launch({ headless: true });
  let text;
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle' });
    text = await page.locator(selector).innerText();
  } finally {
    // Release the browser even when navigation or the element lookup throws;
    // otherwise every failed check leaves a Chromium process behind.
    await browser.close();
  }

  // Collapse whitespace runs so layout-only differences do not change the hash.
  // Dynamic values such as timestamps are NOT removed here; exclude them by
  // choosing a narrow selector.
  const normalized = text.replace(/\s+/g, ' ').trim();
  return {
    hash: crypto.createHash('sha256').update(normalized).digest('hex'),
    text: normalized,
  };
}
/**
 * Compare the current content fingerprint of a page element with the most
 * recently stored one and, when they differ (or nothing is stored yet), send
 * an alert and append the new snapshot to `content_history`.
 *
 * @param {string} url - Address of the page to check; also the history key.
 * @param {string} selector - Selector of the element being monitored.
 * @returns {Promise<void>}
 */
async function checkChanges(url, selector) {
  const { hash, text } = await getContentHash(url, selector);

  const { rows } = await db.query(
    'SELECT hash, text FROM content_history WHERE url=$1 ORDER BY checked_at DESC LIMIT 1',
    [url]
  );
  const latest = rows[0];

  // Nothing to do when a snapshot exists and its hash matches the current one.
  const isUnchanged = rows.length !== 0 && latest.hash === hash;
  if (isUnchanged) {
    return;
  }

  // On the very first check there is no earlier snapshot, so the previous
  // text passed to the alert is undefined.
  await alertChange(url, latest?.text, text);

  const snapshot = [url, hash, text];
  await db.query(
    'INSERT INTO content_history (url, hash, text, checked_at) VALUES ($1, $2, $3, NOW())',
    snapshot
  );
}
Pułapka: treść dynamiczna (znaczniki czasu, sloty reklamowe, losowa kolejność elementów) generuje fałszywe alarmy. Zawężaj selektor jak najmocniej (np. do głównego artykułu zamiast całego `body`).