From 5a5ff9f389e5455ffadcf00ff411e37f72df5b6c Mon Sep 17 00:00:00 2001 From: Felix Woestmann <felix.wostmann@edps.europa.eu> Date: Wed, 18 Dec 2024 11:49:03 +0100 Subject: [PATCH 1/6] feature: add skipHeadRequest as parameter to collector --- src/collector/browser-session.ts | 1 + src/collector/index.ts | 1 + src/commands/collectorCommand.ts | 6 ++++++ 3 files changed, 8 insertions(+) diff --git a/src/collector/browser-session.ts b/src/collector/browser-session.ts index 143d986..13f9c58 100644 --- a/src/collector/browser-session.ts +++ b/src/collector/browser-session.ts @@ -49,6 +49,7 @@ export interface BrowserArgs { sleep: number; cookies: string; seed?: string; + skipHeadRequest: boolean; } export class BrowserSession { diff --git a/src/collector/index.ts b/src/collector/index.ts index aa5d406..7673f6e 100644 --- a/src/collector/index.ts +++ b/src/collector/index.ts @@ -107,6 +107,7 @@ export class Collector { sleep: this.args.sleep, cookies: this.args.setCookie, seed: this.args.seed, + skipHeadRequest: this.args.skipHeadRequest, }; this.browserSession = new BrowserSession(browserArgs, this.logger); diff --git a/src/commands/collectorCommand.ts b/src/commands/collectorCommand.ts index e7727e2..4cee5ad 100644 --- a/src/commands/collectorCommand.ts +++ b/src/commands/collectorCommand.ts @@ -153,6 +153,12 @@ export default { .number("page-timeout") .default("page-timeout", 0) .nargs("page-timeout", 1) + .boolean("skip-head-request") + .describe( + "skip-head-request", + "Skip the initial HEAD request and directly attempt to access the resource. Useful when HEAD requests are blocked but the resource is still accessible.", + ) + .default("skip-head-request", false) .check((argv) => { let invokedAsDefaultCommand = argv._[0] !== collectorCommand; let urlPosition = invokedAsDefaultCommand ? 
0 : 1; -- GitLab From 1da312fba7c37b748ac9c8b4cf71c25102811f6f Mon Sep 17 00:00:00 2001 From: Felix Woestmann <felix.wostmann@edps.europa.eu> Date: Wed, 18 Dec 2024 12:05:12 +0100 Subject: [PATCH 2/6] feature: the HEAD request checking the MIME type before browsing a website will be skipped when --no-skip-head option is provided --- src/collector/page-session.ts | 82 +++++++++++++++++++++++++---------- src/lib/proxy_config.ts | 14 +++--- 2 files changed, 66 insertions(+), 30 deletions(-) diff --git a/src/collector/page-session.ts b/src/collector/page-session.ts index 17c5cb2..84e7e01 100644 --- a/src/collector/page-session.ts +++ b/src/collector/page-session.ts @@ -14,7 +14,10 @@ import { } from "../lib/tools.js"; import parseContentSecurityPolicy from "content-security-policy-parser"; import sampleSize from "lodash/sampleSize.js"; -import { getGotProxyConfiguration } from "../lib/proxy_config.js"; +import { + getGotProxyConfiguration, + GotProxyConfiguration, +} from "../lib/proxy_config.js"; import got from "got"; import { BrowserSession, Hosts } from "./browser-session.js"; @@ -199,6 +202,53 @@ export class PageSession { } } + async shouldSkipLink( + link: string, + proxyConfig: undefined | GotProxyConfiguration, + ): Promise<boolean> { + if (this.browserSession.browserArgs.skipHeadRequest) { + return false; + } + + try { + // check mime-type and skip if not html + const head = await got(link, { + method: "HEAD", + throwHttpErrors: false, + // ignore Error: unable to verify the first certificate (https://stackoverflow.com/a/36194483) + // certificate errors should be checked in the context of the browsing and not during the mime-type check + https: { + rejectUnauthorized: false, + }, + ...(proxyConfig && { agent: proxyConfig }), + }); + + if (!head.ok) { + this.browserSession.logger + .warn(`Fetching the HEAD for ${link} unexpectedly returned HTTP status code ${head.statusCode}. + The page will be skipped. 
Use --skip-head-request option to disable the check.`); + return true; + } + + if (!head.headers["content-type"].startsWith("text/html")) { + this.browserSession.logger.log( + "info", + `skipping now ${link} of mime-type ${head["content-type"]}`, + { type: "Browser" }, + ); + return true; + } + + return false; + } catch (error) { + this.browserSession.logger.error( + `An error occurred while checking if ${link} should be skipped.`, + error.message, + ); + return true; + } + } + async browseSamples( page: Page, localStorage, @@ -227,31 +277,15 @@ export class PageSession { for (const link of browsing_history.slice(1)) { // can have zero iterations! - try { - // check mime-type and skip if not html - const head = await got(link, { - method: "HEAD", - // ignore Error: unable to verify the first certificate (https://stackoverflow.com/a/36194483) - // certificate errors should be checked in the context of the browsing and not during the mime-type check - https: { - rejectUnauthorized: false, - }, - ...(proxyConfig && { agent: proxyConfig }), - }); - - if (!head.headers["content-type"].startsWith("text/html")) { - this.browserSession.logger.log( - "info", - `skipping now ${link} of mime-type ${head["content-type"]}`, - { type: "Browser" }, - ); - continue; - } + if (await this.shouldSkipLink(link, proxyConfig)) { + continue; + } - this.browserSession.logger.log("info", `browsing now to ${link}`, { - type: "Browser", - }); + this.browserSession.logger.log("info", `browsing now to ${link}`, { + type: "Browser", + }); + try { await page.goto(link, { timeout: this.browserSession.browserArgs.pageLoadTimeout, waitUntil: "networkidle2", diff --git a/src/lib/proxy_config.ts b/src/lib/proxy_config.ts index f0022ad..613e34f 100644 --- a/src/lib/proxy_config.ts +++ b/src/lib/proxy_config.ts @@ -71,12 +71,14 @@ export function getChromiumProxyConfiguration(logger: Logger): string | null { return chromiumProxyConfiguration; } -export function getGotProxyConfiguration(logger: Logger): 
- | undefined - | { - http: HttpProxyAgent | undefined; - https: HttpsProxyAgent | undefined; - } { +export type GotProxyConfiguration = { + http: HttpProxyAgent | undefined; + https: HttpsProxyAgent | undefined; +}; + +export function getGotProxyConfiguration( + logger: Logger, +): undefined | GotProxyConfiguration { loadProxyConfiguration(logger); if (proxyConfigInstance == null) { -- GitLab From 7301729de37d0ed9c0da27a620c2fd011f701f0d Mon Sep 17 00:00:00 2001 From: Felix Woestmann <felix.wostmann@edps.europa.eu> Date: Wed, 18 Dec 2024 12:12:31 +0100 Subject: [PATCH 3/6] feature: only add links to the history that have actually been browsed --- src/collector/page-session.ts | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/collector/page-session.ts b/src/collector/page-session.ts index 84e7e01..72971c5 100644 --- a/src/collector/page-session.ts +++ b/src/collector/page-session.ts @@ -253,14 +253,17 @@ export class PageSession { page: Page, localStorage, root_uri, + // All links collected from the website and filtered for those that are considered first-party firstPartyLinks, - userSet, + // Links provided by the user which have to be browsed + userSetLinks, ) { - const preset_links = [page.url(), ...userSet]; + const preset_links = [page.url(), ...userSetLinks]; const extra_links = firstPartyLinks .map((l) => l.href) .filter((l) => !preset_links.includes(l)); - const random_links = this.browserSession.browserArgs.seed + + const randomLinks = this.browserSession.browserArgs.seed ? sampleSizeSeeded( extra_links, this.browserSession.browserArgs.linkLimit - preset_links.length, @@ -271,11 +274,13 @@ export class PageSession { this.browserSession.browserArgs.linkLimit - preset_links.length, ); // can be empty! 
- const browsing_history = [root_uri, ...userSet, ...random_links]; + let linksToBrowse = [...userSetLinks, ...randomLinks]; + + let linksBrowsed = []; let proxyConfig = getGotProxyConfiguration(this.browserSession.logger); - for (const link of browsing_history.slice(1)) { + for (const link of linksToBrowse) { // can have zero iterations! if (await this.shouldSkipLink(link, proxyConfig)) { continue; @@ -285,6 +290,8 @@ export class PageSession { type: "Browser", }); + linksBrowsed.push(link); + try { await page.goto(link, { timeout: this.browserSession.browserArgs.pageLoadTimeout, @@ -307,7 +314,7 @@ export class PageSession { ); } - return browsing_history; + return [root_uri, ...linksBrowsed]; } async takeScreenshots() { -- GitLab From 9b51aebf0b6fd43630bb68f9528c68ce7216c46d Mon Sep 17 00:00:00 2001 From: Felix Woestmann <felix.wostmann@edps.europa.eu> Date: Wed, 18 Dec 2024 12:26:02 +0100 Subject: [PATCH 4/6] refactor: reuse gotoPage function --- src/collector/index.ts | 4 ++++ src/collector/page-session.ts | 23 +++++++++-------------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/collector/index.ts b/src/collector/index.ts index 7673f6e..e64f9ef 100644 --- a/src/collector/index.ts +++ b/src/collector/index.ts @@ -131,6 +131,10 @@ export class Collector { const response = await this.pageSession.gotoPage(url); + if (response == null) { + process.exit(2); + } + // log redirects this.output.uri_redirects = response .request() diff --git a/src/collector/page-session.ts b/src/collector/page-session.ts index 72971c5..209ab48 100644 --- a/src/collector/page-session.ts +++ b/src/collector/page-session.ts @@ -1,4 +1,4 @@ -import { Page } from "puppeteer"; +import { HTTPResponse, Page } from "puppeteer"; import url from "url"; import escapeRegExp from "lodash/escapeRegExp.js"; import { CookieRecorder } from "./recorder/cookie-recorder.js"; @@ -179,17 +179,18 @@ export class PageSession { }); } - async gotoPage(u) { - 
this.browserSession.logger.log("info", `browsing now to ${u}`, { + async gotoPage(url): Promise<HTTPResponse | null> { + this.browserSession.logger.log("info", `browsing now to ${url}`, { type: "Browser", }); try { - let page_response = await this.page.goto(u, { + let page_response = await this.page.goto(url, { timeout: this.browserSession.browserArgs.pageLoadTimeout, waitUntil: "networkidle2", }); if (page_response === null) { + // https://github.com/puppeteer/puppeteer/issues/2479#issuecomment-408263504 page_response = await this.page.waitForResponse(() => true); } @@ -198,7 +199,8 @@ export class PageSession { this.browserSession.logger.log("error", error.message, { type: "Browser", }); - process.exit(2); + + return null; } } @@ -292,15 +294,8 @@ export class PageSession { linksBrowsed.push(link); - try { - await page.goto(link, { - timeout: this.browserSession.browserArgs.pageLoadTimeout, - waitUntil: "networkidle2", - }); - } catch (error) { - this.browserSession.logger.log("warn", error.message, { - type: "Browser", - }); + let response = await this.gotoPage(link); + if (response == null) { continue; } -- GitLab From 910b84284f6250008be46ede1fd9528ea5ff8d57 Mon Sep 17 00:00:00 2001 From: Felix Woestmann <felix.wostmann@edps.europa.eu> Date: Wed, 18 Dec 2024 12:52:02 +0100 Subject: [PATCH 5/6] fix: error in .gitlab-ci.yml --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 297cba8..2a63592 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -21,7 +21,7 @@ build-job: - cd build/ - npm pack --pack-destination='../artifacts' - cd ../artifacts - - mv website-evidence-collector-$(PACKAGE_VERSION).tgz website-evidence-collector.tgz + - mv website-evidence-collector-$PACKAGE_VERSION.tgz website-evidence-collector.tgz artifacts: paths: - build/artifacts -- GitLab From 8e0c56c96720bdaa96c15ffc2fc960337e4e663e Mon Sep 17 00:00:00 2001 From: Felix Woestmann <felix.wostmann@edps.europa.eu> Date: Wed, 
15 Jan 2025 12:26:44 +0100 Subject: [PATCH 6/6] update: CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 307726c..11c6e46 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## HEAD +- **Added option --skip-head-request**. When surfing additional links, the software checks the MIME type using a HEAD request to skip PDFs, videos, etc. With this new option enabled, the HEAD request is skipped. This option is intended for cases where a HEAD request fails (e.g., when blocked) but a normal request would succeed. +- The browsing history in the report now only includes sites actually visited by the software. Previously, it also included sites that were filtered out due to their MIME type. + ## 3.0.0 / 2024-11-27 - BREAKING CHANGES: - Delete `reporter` script and move functionality to subcommand of WEC reachable under `website-evidence-collector reporter` -- GitLab