From 5a5ff9f389e5455ffadcf00ff411e37f72df5b6c Mon Sep 17 00:00:00 2001
From: Felix Woestmann <felix.wostmann@edps.europa.eu>
Date: Wed, 18 Dec 2024 11:49:03 +0100
Subject: [PATCH 1/6] feature: add skipHeadRequest as parameter to collector

---
 src/collector/browser-session.ts | 1 +
 src/collector/index.ts           | 1 +
 src/commands/collectorCommand.ts | 6 ++++++
 3 files changed, 8 insertions(+)

diff --git a/src/collector/browser-session.ts b/src/collector/browser-session.ts
index 143d986..13f9c58 100644
--- a/src/collector/browser-session.ts
+++ b/src/collector/browser-session.ts
@@ -49,6 +49,7 @@ export interface BrowserArgs {
   sleep: number;
   cookies: string;
   seed?: string;
+  skipHeadRequest: boolean;
 }
 
 export class BrowserSession {
diff --git a/src/collector/index.ts b/src/collector/index.ts
index aa5d406..7673f6e 100644
--- a/src/collector/index.ts
+++ b/src/collector/index.ts
@@ -107,6 +107,7 @@ export class Collector {
       sleep: this.args.sleep,
       cookies: this.args.setCookie,
       seed: this.args.seed,
+      skipHeadRequest: this.args.skipHeadRequest,
     };
     this.browserSession = new BrowserSession(browserArgs, this.logger);
 
diff --git a/src/commands/collectorCommand.ts b/src/commands/collectorCommand.ts
index e7727e2..4cee5ad 100644
--- a/src/commands/collectorCommand.ts
+++ b/src/commands/collectorCommand.ts
@@ -153,6 +153,12 @@ export default {
       .number("page-timeout")
       .default("page-timeout", 0)
       .nargs("page-timeout", 1)
+      .boolean("skip-head-request")
+      .describe(
+        "skip-head-request",
+        "Skip the initial HEAD request and directly attempt to access the resource. Useful when HEAD requests are blocked but the resource is still accessible.",
+      )
+      .default("skip-head-request", false)
       .check((argv) => {
         let invokedAsDefaultCommand = argv._[0] !== collectorCommand;
         let urlPosition = invokedAsDefaultCommand ? 0 : 1;
-- 
GitLab


From 1da312fba7c37b748ac9c8b4cf71c25102811f6f Mon Sep 17 00:00:00 2001
From: Felix Woestmann <felix.wostmann@edps.europa.eu>
Date: Wed, 18 Dec 2024 12:05:12 +0100
Subject: [PATCH 2/6] feature: the HEAD request checking the MIME type before
 browsing a website will be skipped when --skip-head-request option is provided

---
 src/collector/page-session.ts | 82 +++++++++++++++++++++++++----------
 src/lib/proxy_config.ts       | 14 +++---
 2 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/src/collector/page-session.ts b/src/collector/page-session.ts
index 17c5cb2..84e7e01 100644
--- a/src/collector/page-session.ts
+++ b/src/collector/page-session.ts
@@ -14,7 +14,10 @@ import {
 } from "../lib/tools.js";
 import parseContentSecurityPolicy from "content-security-policy-parser";
 import sampleSize from "lodash/sampleSize.js";
-import { getGotProxyConfiguration } from "../lib/proxy_config.js";
+import {
+  getGotProxyConfiguration,
+  GotProxyConfiguration,
+} from "../lib/proxy_config.js";
 import got from "got";
 import { BrowserSession, Hosts } from "./browser-session.js";
 
@@ -199,6 +202,53 @@ export class PageSession {
     }
   }
 
+  async shouldSkipLink(
+    link: string,
+    proxyConfig: undefined | GotProxyConfiguration,
+  ): Promise<boolean> {
+    if (this.browserSession.browserArgs.skipHeadRequest) {
+      return false; // --skip-head-request given: browse the link without probing it first
+    }
+
+    try {
+      // probe the link with a HEAD request: only responses served as text/html are browsed
+      const head = await got(link, {
+        method: "HEAD",
+        throwHttpErrors: false,
+        // ignore Error: unable to verify the first certificate (https://stackoverflow.com/a/36194483)
+        // certificate errors should be checked in the context of the browsing and not during the mime-type check
+        https: {
+          rejectUnauthorized: false,
+        },
+        ...(proxyConfig && { agent: proxyConfig }),
+      });
+
+      if (!head.ok) {
+        this.browserSession.logger
+          .warn(`Fetching the HEAD for ${link} unexpectedly returned HTTP status code ${head.statusCode}.
+            The page will be skipped. Use --skip-head-request option to disable the check.`);
+        return true; // non-2xx answer to the HEAD probe: skip the link
+      }
+
+      if (!head.headers["content-type"]?.startsWith("text/html")) {
+        this.browserSession.logger.log(
+          "info",
+          `skipping now ${link} of mime-type ${head.headers["content-type"]}`,
+          { type: "Browser" },
+        );
+        return true; // missing or non-HTML content type: skip the link
+      }
+
+      return false; // HTML page: the caller may browse it
+    } catch (error) {
+      this.browserSession.logger.error(
+        `An error occurred while checking if ${link} should be skipped.`,
+        error.message,
+      );
+      return true; // the HEAD request itself failed: skip the link
+    }
+  }
+
   async browseSamples(
     page: Page,
     localStorage,
@@ -227,31 +277,15 @@ export class PageSession {
 
     for (const link of browsing_history.slice(1)) {
       // can have zero iterations!
-      try {
-        // check mime-type and skip if not html
-        const head = await got(link, {
-          method: "HEAD",
-          // ignore Error: unable to verify the first certificate (https://stackoverflow.com/a/36194483)
-          // certificate errors should be checked in the context of the browsing and not during the mime-type check
-          https: {
-            rejectUnauthorized: false,
-          },
-          ...(proxyConfig && { agent: proxyConfig }),
-        });
-
-        if (!head.headers["content-type"].startsWith("text/html")) {
-          this.browserSession.logger.log(
-            "info",
-            `skipping now ${link} of mime-type ${head["content-type"]}`,
-            { type: "Browser" },
-          );
-          continue;
-        }
+      if (await this.shouldSkipLink(link, proxyConfig)) {
+        continue;
+      }
 
-        this.browserSession.logger.log("info", `browsing now to ${link}`, {
-          type: "Browser",
-        });
+      this.browserSession.logger.log("info", `browsing now to ${link}`, {
+        type: "Browser",
+      });
 
+      try {
         await page.goto(link, {
           timeout: this.browserSession.browserArgs.pageLoadTimeout,
           waitUntil: "networkidle2",
diff --git a/src/lib/proxy_config.ts b/src/lib/proxy_config.ts
index f0022ad..613e34f 100644
--- a/src/lib/proxy_config.ts
+++ b/src/lib/proxy_config.ts
@@ -71,12 +71,14 @@ export function getChromiumProxyConfiguration(logger: Logger): string | null {
   return chromiumProxyConfiguration;
 }
 
-export function getGotProxyConfiguration(logger: Logger):
-  | undefined
-  | {
-      http: HttpProxyAgent | undefined;
-      https: HttpsProxyAgent | undefined;
-    } {
+export type GotProxyConfiguration = {
+  http: HttpProxyAgent | undefined;
+  https: HttpsProxyAgent | undefined;
+};
+
+export function getGotProxyConfiguration(
+  logger: Logger,
+): undefined | GotProxyConfiguration {
   loadProxyConfiguration(logger);
 
   if (proxyConfigInstance == null) {
-- 
GitLab


From 7301729de37d0ed9c0da27a620c2fd011f701f0d Mon Sep 17 00:00:00 2001
From: Felix Woestmann <felix.wostmann@edps.europa.eu>
Date: Wed, 18 Dec 2024 12:12:31 +0100
Subject: [PATCH 3/6] feature: only add links to the history that have actually
 been browsed

---
 src/collector/page-session.ts | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/collector/page-session.ts b/src/collector/page-session.ts
index 84e7e01..72971c5 100644
--- a/src/collector/page-session.ts
+++ b/src/collector/page-session.ts
@@ -253,14 +253,17 @@ export class PageSession {
     page: Page,
     localStorage,
     root_uri,
+    // All links collected from the website and filtered for those that are considered first-party
     firstPartyLinks,
-    userSet,
+    // Links provided by the user which have to be browsed
+    userSetLinks,
   ) {
-    const preset_links = [page.url(), ...userSet];
+    const preset_links = [page.url(), ...userSetLinks];
     const extra_links = firstPartyLinks
       .map((l) => l.href)
       .filter((l) => !preset_links.includes(l));
-    const random_links = this.browserSession.browserArgs.seed
+
+    const randomLinks = this.browserSession.browserArgs.seed
       ? sampleSizeSeeded(
           extra_links,
           this.browserSession.browserArgs.linkLimit - preset_links.length,
@@ -271,11 +274,13 @@ export class PageSession {
           this.browserSession.browserArgs.linkLimit - preset_links.length,
         ); // can be empty!
 
-    const browsing_history = [root_uri, ...userSet, ...random_links];
+    let linksToBrowse = [...userSetLinks, ...randomLinks];
+
+    let linksBrowsed = [];
 
     let proxyConfig = getGotProxyConfiguration(this.browserSession.logger);
 
-    for (const link of browsing_history.slice(1)) {
+    for (const link of linksToBrowse) {
       // can have zero iterations!
       if (await this.shouldSkipLink(link, proxyConfig)) {
         continue;
@@ -285,6 +290,8 @@ export class PageSession {
         type: "Browser",
       });
 
+      linksBrowsed.push(link);
+
       try {
         await page.goto(link, {
           timeout: this.browserSession.browserArgs.pageLoadTimeout,
@@ -307,7 +314,7 @@ export class PageSession {
       );
     }
 
-    return browsing_history;
+    return [root_uri, ...linksBrowsed];
   }
 
   async takeScreenshots() {
-- 
GitLab


From 9b51aebf0b6fd43630bb68f9528c68ce7216c46d Mon Sep 17 00:00:00 2001
From: Felix Woestmann <felix.wostmann@edps.europa.eu>
Date: Wed, 18 Dec 2024 12:26:02 +0100
Subject: [PATCH 4/6] refactor: reuse gotoPage function

---
 src/collector/index.ts        |  4 ++++
 src/collector/page-session.ts | 23 +++++++++--------------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/collector/index.ts b/src/collector/index.ts
index 7673f6e..e64f9ef 100644
--- a/src/collector/index.ts
+++ b/src/collector/index.ts
@@ -131,6 +131,10 @@ export class Collector {
 
     const response = await this.pageSession.gotoPage(url);
 
+    if (response == null) {
+      process.exit(2);
+    }
+
     // log redirects
     this.output.uri_redirects = response
       .request()
diff --git a/src/collector/page-session.ts b/src/collector/page-session.ts
index 72971c5..209ab48 100644
--- a/src/collector/page-session.ts
+++ b/src/collector/page-session.ts
@@ -1,4 +1,4 @@
-import { Page } from "puppeteer";
+import { HTTPResponse, Page } from "puppeteer";
 import url from "url";
 import escapeRegExp from "lodash/escapeRegExp.js";
 import { CookieRecorder } from "./recorder/cookie-recorder.js";
@@ -179,17 +179,18 @@ export class PageSession {
     });
   }
 
-  async gotoPage(u) {
-    this.browserSession.logger.log("info", `browsing now to ${u}`, {
+  async gotoPage(url): Promise<HTTPResponse | null> {
+    this.browserSession.logger.log("info", `browsing now to ${url}`, {
       type: "Browser",
     });
 
     try {
-      let page_response = await this.page.goto(u, {
+      let page_response = await this.page.goto(url, {
         timeout: this.browserSession.browserArgs.pageLoadTimeout,
         waitUntil: "networkidle2",
       });
       if (page_response === null) {
+        // https://github.com/puppeteer/puppeteer/issues/2479#issuecomment-408263504
         page_response = await this.page.waitForResponse(() => true);
       }
 
@@ -198,7 +199,8 @@ export class PageSession {
       this.browserSession.logger.log("error", error.message, {
         type: "Browser",
       });
-      process.exit(2);
+
+      return null;
     }
   }
 
@@ -292,15 +294,8 @@ export class PageSession {
 
       linksBrowsed.push(link);
 
-      try {
-        await page.goto(link, {
-          timeout: this.browserSession.browserArgs.pageLoadTimeout,
-          waitUntil: "networkidle2",
-        });
-      } catch (error) {
-        this.browserSession.logger.log("warn", error.message, {
-          type: "Browser",
-        });
+      let response = await this.gotoPage(link);
+      if (response == null) {
         continue;
       }
 
-- 
GitLab


From 910b84284f6250008be46ede1fd9528ea5ff8d57 Mon Sep 17 00:00:00 2001
From: Felix Woestmann <felix.wostmann@edps.europa.eu>
Date: Wed, 18 Dec 2024 12:52:02 +0100
Subject: [PATCH 5/6] fix: error in .gitlab-ci.yml

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 297cba8..2a63592 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -21,7 +21,7 @@ build-job:
     - cd build/
     - npm pack --pack-destination='../artifacts'
     - cd ../artifacts
-    - mv website-evidence-collector-$(PACKAGE_VERSION).tgz website-evidence-collector.tgz
+    - mv website-evidence-collector-$PACKAGE_VERSION.tgz website-evidence-collector.tgz
   artifacts:
     paths:
       - build/artifacts
-- 
GitLab


From 8e0c56c96720bdaa96c15ffc2fc960337e4e663e Mon Sep 17 00:00:00 2001
From: Felix Woestmann <felix.wostmann@edps.europa.eu>
Date: Wed, 15 Jan 2025 12:26:44 +0100
Subject: [PATCH 6/6] update: CHANGELOG.md

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 307726c..11c6e46 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## HEAD
+- **Added option --skip-head-request**. When surfing additional links, the software checks the MIME type using a HEAD request to skip PDFs, videos, etc. With this new option enabled, the HEAD request is skipped. This option is intended for cases where a HEAD request fails (e.g., when blocked) but a normal request would succeed.
+- The browsing history in the report now only includes sites actually visited by the software. Previously, it also included sites that were filtered out due to their MIME type.
+
 ## 3.0.0 / 2024-11-27
 - BREAKING CHANGES:
   - Delete `reporter` script and move functionality to subcommand of WEC reachable under `website-evidence-collector reporter`
-- 
GitLab