diff --git a/20231023-linearSVC_cpv_division_classifier_from_all_EN_notices.ipynb b/20231023-linearSVC_cpv_division_classifier_from_all_EN_notices.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..35fa23d3edd262b94058b49dbe6f1c2341a5c045
--- /dev/null
+++ b/20231023-linearSVC_cpv_division_classifier_from_all_EN_notices.ipynb
@@ -0,0 +1,1206 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "88df97b8-0aba-49e0-8dd9-db5ca675add2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: pandas in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (2.0.3)\n",
+      "Collecting scikit-learn==1.2.2\n",
+      "  Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.6/9.6 MB\u001b[0m \u001b[31m97.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hCollecting spacy==3.5.0\n",
+      "  Downloading spacy-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.6/6.6 MB\u001b[0m \u001b[31m44.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
+      "\u001b[?25hCollecting imblearn\n",
+      "  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)\n",
+      "Requirement already satisfied: unidecode in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (1.3.6)\n",
+      "Collecting optuna\n",
+      "  Obtaining dependency information for optuna from https://files.pythonhosted.org/packages/05/3c/e9715756751e56f7df4b64c999650f418f6b48f73a824bbfe8e3604385e2/optuna-3.4.0-py3-none-any.whl.metadata\n",
+      "  Downloading optuna-3.4.0-py3-none-any.whl.metadata (17 kB)\n",
+      "Requirement already satisfied: numpy>=1.17.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from scikit-learn==1.2.2) (1.22.3)\n",
+      "Requirement already satisfied: scipy>=1.3.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from scikit-learn==1.2.2) (1.11.1)\n",
+      "Requirement already satisfied: joblib>=1.1.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from scikit-learn==1.2.2) (1.3.0)\n",
+      "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from scikit-learn==1.2.2) (3.2.0)\n",
+      "Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy==3.5.0)\n",
+      "  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)\n",
+      "Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy==3.5.0)\n",
+      "  Obtaining dependency information for spacy-loggers<2.0.0,>=1.0.0 from https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl.metadata\n",
+      "  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)\n",
+      "Collecting murmurhash<1.1.0,>=0.28.0 (from spacy==3.5.0)\n",
+      "  Obtaining dependency information for murmurhash<1.1.0,>=0.28.0 from https://files.pythonhosted.org/packages/a8/ca/359ae4246cccaf3f6386b66bd9ba4a39e6ec342f89e2c4def361a8cbe7cf/murmurhash-1.0.10-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n",
+      "  Downloading murmurhash-1.0.10-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)\n",
+      "Collecting cymem<2.1.0,>=2.0.2 (from spacy==3.5.0)\n",
+      "  Obtaining dependency information for cymem<2.1.0,>=2.0.2 from https://files.pythonhosted.org/packages/e9/13/3bed1a1d1cce7937eb797d760c0cca973dbdc1891ad7e2f066ae418fd697/cymem-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n",
+      "  Downloading cymem-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)\n",
+      "Collecting preshed<3.1.0,>=3.0.2 (from spacy==3.5.0)\n",
+      "  Obtaining dependency information for preshed<3.1.0,>=3.0.2 from https://files.pythonhosted.org/packages/42/59/8f65ad22c13020ff281529e415c32a56cfa691d24b0eca2eb3d756e4d644/preshed-3.0.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n",
+      "  Downloading preshed-3.0.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)\n",
+      "Collecting thinc<8.2.0,>=8.1.0 (from spacy==3.5.0)\n",
+      "  Obtaining dependency information for thinc<8.2.0,>=8.1.0 from https://files.pythonhosted.org/packages/d7/fc/2ea1a37a60ad1c7b9f41699ccd29170f6d479d3349e6742503278b4bc811/thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n",
+      "  Downloading thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)\n",
+      "Collecting wasabi<1.2.0,>=0.9.1 (from spacy==3.5.0)\n",
+      "  Obtaining dependency information for wasabi<1.2.0,>=0.9.1 from https://files.pythonhosted.org/packages/8f/69/26cbf0bad11703241cb84d5324d868097f7a8faf2f1888354dac8883f3fc/wasabi-1.1.2-py3-none-any.whl.metadata\n",
+      "  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)\n",
+      "Collecting srsly<3.0.0,>=2.4.3 (from spacy==3.5.0)\n",
+      "  Obtaining dependency information for srsly<3.0.0,>=2.4.3 from https://files.pythonhosted.org/packages/32/69/2c054c6c5dc5daf5648f994f22377f3be44f79d643f3c3db255b4e86b391/srsly-2.4.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n",
+      "  Downloading srsly-2.4.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)\n",
+      "Collecting catalogue<2.1.0,>=2.0.6 (from spacy==3.5.0)\n",
+      "  Obtaining dependency information for catalogue<2.1.0,>=2.0.6 from https://files.pythonhosted.org/packages/9e/96/d32b941a501ab566a16358d68b6eb4e4acc373fab3c3c4d7d9e649f7b4bb/catalogue-2.0.10-py3-none-any.whl.metadata\n",
+      "  Downloading catalogue-2.0.10-py3-none-any.whl.metadata (14 kB)\n",
+      "Collecting typer<0.8.0,>=0.3.0 (from spacy==3.5.0)\n",
+      "  Downloading typer-0.7.0-py3-none-any.whl (38 kB)\n",
+      "Collecting pathy>=0.10.0 (from spacy==3.5.0)\n",
+      "  Obtaining dependency information for pathy>=0.10.0 from https://files.pythonhosted.org/packages/0e/6b/d64babaaeaea0311e55a193d6385bcd2b342e30158ce336cbc05eae7fec6/pathy-0.10.3-py3-none-any.whl.metadata\n",
+      "  Downloading pathy-0.10.3-py3-none-any.whl.metadata (16 kB)\n",
+      "Collecting smart-open<7.0.0,>=5.2.1 (from spacy==3.5.0)\n",
+      "  Obtaining dependency information for smart-open<7.0.0,>=5.2.1 from https://files.pythonhosted.org/packages/fc/d9/d97f1db64b09278aba64e8c81b5d322d436132df5741c518f3823824fae0/smart_open-6.4.0-py3-none-any.whl.metadata\n",
+      "  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)\n",
+      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy==3.5.0) (4.65.0)\n",
+      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy==3.5.0) (2.31.0)\n",
+      "Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 (from spacy==3.5.0)\n",
+      "  Obtaining dependency information for pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 from https://files.pythonhosted.org/packages/e0/2f/d6f17f8385d718233bcae893d27525443d41201c938b68a4af3d591a33e4/pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n",
+      "  Downloading pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (149 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m149.6/149.6 kB\u001b[0m \u001b[31m40.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: jinja2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy==3.5.0) (3.1.2)\n",
+      "Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy==3.5.0) (68.0.0)\n",
+      "Requirement already satisfied: packaging>=20.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy==3.5.0) (21.3)\n",
+      "Collecting langcodes<4.0.0,>=3.2.0 (from spacy==3.5.0)\n",
+      "  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m181.6/181.6 kB\u001b[0m \u001b[31m45.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from pandas) (2.8.2)\n",
+      "Requirement already satisfied: pytz>=2020.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from pandas) (2023.3)\n",
+      "Requirement already satisfied: tzdata>=2022.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from pandas) (2023.3)\n",
+      "Collecting imbalanced-learn (from imblearn)\n",
+      "  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/a3/9e/fbe60a768502af54563dcb59ca7856f5a8833b3ad5ada658922e1ab09b7f/imbalanced_learn-0.11.0-py3-none-any.whl.metadata\n",
+      "  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)\n",
+      "Collecting alembic>=1.5.0 (from optuna)\n",
+      "  Obtaining dependency information for alembic>=1.5.0 from https://files.pythonhosted.org/packages/a2/8b/46919127496036c8e990b2b236454a0d8655fd46e1df2fd35610a9cbc842/alembic-1.12.0-py3-none-any.whl.metadata\n",
+      "  Downloading alembic-1.12.0-py3-none-any.whl.metadata (7.2 kB)\n",
+      "Collecting colorlog (from optuna)\n",
+      "  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)\n",
+      "Requirement already satisfied: sqlalchemy>=1.3.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from optuna) (2.0.19)\n",
+      "Requirement already satisfied: PyYAML in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from optuna) (6.0)\n",
+      "Collecting Mako (from alembic>=1.5.0->optuna)\n",
+      "  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.7/78.7 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: typing-extensions>=4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from alembic>=1.5.0->optuna) (4.7.1)\n",
+      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from packaging>=20.0->spacy==3.5.0) (3.0.9)\n",
+      "Requirement already satisfied: six>=1.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy==3.5.0) (3.2.0)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy==3.5.0) (3.4)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy==3.5.0) (1.26.14)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy==3.5.0) (2023.5.7)\n",
+      "Requirement already satisfied: greenlet!=0.4.17 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from sqlalchemy>=1.3.0->optuna) (2.0.2)\n",
+      "Collecting blis<0.8.0,>=0.7.8 (from thinc<8.2.0,>=8.1.0->spacy==3.5.0)\n",
+      "  Obtaining dependency information for blis<0.8.0,>=0.7.8 from https://files.pythonhosted.org/packages/9b/81/55092e1c016fe05ef7a57623920209012f05e8b897acbad355c9bf854181/blis-0.7.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata\n",
+      "  Downloading blis-0.7.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)\n",
+      "Collecting confection<1.0.0,>=0.0.1 (from thinc<8.2.0,>=8.1.0->spacy==3.5.0)\n",
+      "  Obtaining dependency information for confection<1.0.0,>=0.0.1 from https://files.pythonhosted.org/packages/93/f8/e89268a1f885048fb2ee6b5c9f93c4e90de768534acfef3652f87d97d4cb/confection-0.1.3-py3-none-any.whl.metadata\n",
+      "  Downloading confection-0.1.3-py3-none-any.whl.metadata (19 kB)\n",
+      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from typer<0.8.0,>=0.3.0->spacy==3.5.0) (8.1.6)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from jinja2->spacy==3.5.0) (2.1.3)\n",
+      "Downloading optuna-3.4.0-py3-none-any.whl (409 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m409.6/409.6 kB\u001b[0m \u001b[31m14.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading alembic-1.12.0-py3-none-any.whl (226 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m226.0/226.0 kB\u001b[0m \u001b[31m53.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading catalogue-2.0.10-py3-none-any.whl (17 kB)\n",
+      "Downloading cymem-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (46 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.1/46.1 kB\u001b[0m \u001b[31m14.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading murmurhash-1.0.10-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29 kB)\n",
+      "Downloading pathy-0.10.3-py3-none-any.whl (48 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.9/48.9 kB\u001b[0m \u001b[31m14.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading preshed-3.0.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (156 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m156.9/156.9 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m122.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading smart_open-6.4.0-py3-none-any.whl (57 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.0/57.0 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading spacy_loggers-1.0.5-py3-none-any.whl (22 kB)\n",
+      "Downloading srsly-2.4.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (493 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m493.0/493.0 kB\u001b[0m \u001b[31m70.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (919 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m919.6/919.6 kB\u001b[0m \u001b[31m100.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading wasabi-1.1.2-py3-none-any.whl (27 kB)\n",
+      "Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m235.6/235.6 kB\u001b[0m \u001b[31m56.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading blis-0.7.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.2 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.2/10.2 MB\u001b[0m \u001b[31m47.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
+      "\u001b[?25hDownloading confection-0.1.3-py3-none-any.whl (34 kB)\n",
+      "Installing collected packages: cymem, wasabi, typer, spacy-loggers, spacy-legacy, smart-open, pydantic, murmurhash, Mako, langcodes, colorlog, catalogue, blis, srsly, scikit-learn, preshed, pathy, alembic, optuna, imbalanced-learn, confection, thinc, imblearn, spacy\n",
+      "  Attempting uninstall: scikit-learn\n",
+      "    Found existing installation: scikit-learn 1.3.0\n",
+      "    Uninstalling scikit-learn-1.3.0:\n",
+      "      Successfully uninstalled scikit-learn-1.3.0\n",
+      "Successfully installed Mako-1.2.4 alembic-1.12.0 blis-0.7.11 catalogue-2.0.10 colorlog-6.7.0 confection-0.1.3 cymem-2.0.8 imbalanced-learn-0.11.0 imblearn-0.0 langcodes-3.3.0 murmurhash-1.0.10 optuna-3.4.0 pathy-0.10.3 preshed-3.0.9 pydantic-1.10.13 scikit-learn-1.2.2 smart-open-6.4.0 spacy-3.5.0 spacy-legacy-3.0.12 spacy-loggers-1.0.5 srsly-2.4.8 thinc-8.1.12 typer-0.7.0 wasabi-1.1.2\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Use the %pip magic so the packages are installed into the running kernel's environment.\n",
+    "# Versions are pinned to the ones reported by the recorded install log; imbalanced-learn is the\n",
+    "# real distribution behind the imblearn placeholder package and provides the imblearn module.\n",
+    "%pip install pandas==2.0.3 scikit-learn==1.2.2 spacy==3.5.0 imbalanced-learn==0.11.0 unidecode==1.3.6 optuna==3.4.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "faaefdac-c206-4e8e-87b2-2de21c1f818a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting en-core-web-sm==3.5.0\n",
+      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m33.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: spacy<3.6.0,>=3.5.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from en-core-web-sm==3.5.0) (3.5.0)\n",
+      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.12)\n",
+      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.5)\n",
+      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.10)\n",
+      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.8)\n",
+      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.9)\n",
+      "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.1.12)\n",
+      "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.1.2)\n",
+      "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.4.8)\n",
+      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.10)\n",
+      "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.7.0)\n",
+      "Requirement already satisfied: pathy>=0.10.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.10.3)\n",
+      "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (6.4.0)\n",
+      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.65.0)\n",
+      "Requirement already satisfied: numpy>=1.15.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.22.3)\n",
+      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.31.0)\n",
+      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.10.13)\n",
+      "Requirement already satisfied: jinja2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.1.2)\n",
+      "Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (68.0.0)\n",
+      "Requirement already satisfied: packaging>=20.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (21.3)\n",
+      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.3.0)\n",
+      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from packaging>=20.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.9)\n",
+      "Requirement already satisfied: typing-extensions>=4.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.7.1)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.2.0)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.4)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.26.14)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2023.5.7)\n",
+      "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.7.11)\n",
+      "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from thinc<8.2.0,>=8.1.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.1.3)\n",
+      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.1.6)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.1.3)\n",
+      "Installing collected packages: en-core-web-sm\n",
+      "Successfully installed en-core-web-sm-3.5.0\n",
+      "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
+      "You can now load the package via spacy.load('en_core_web_sm')\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re\n",
+    "from unidecode import unidecode\n",
+    "import spacy.cli\n",
+    "\n",
+    "\n",
+    "# Load the small English pipeline, downloading it only when it is not installed yet\n",
+    "# (spacy.load raises OSError for a missing model), so re-running the cell does not\n",
+    "# reinstall the package every time.\n",
+    "try:\n",
+    "    NLP = spacy.load(\"en_core_web_sm\")\n",
+    "except OSError:\n",
+    "    spacy.cli.download(\"en_core_web_sm\")\n",
+    "    NLP = spacy.load(\"en_core_web_sm\")\n",
+    "# Stop-word collection of the loaded pipeline's language defaults, used by _remove_stop_words.\n",
+    "STOP_WORDS = NLP.Defaults.stop_words\n",
+    "CHARACTERS_TO_REPLACE = [\"\\\\n\", \"\\\\r\", \"\\\\t\", \"\\\\W\", \"•\", \"\\t\", \"-\", \"(\", \")\", \":\", \";\", \"?\", \"!\", \"&\", \"\\n\", \"\\r\", \".\", \",\", \"'\", \"’\", \"´\",\n",
+    "                         \"‘\", \"’\", '\"', \"“\", \"”\", \"'\", \"/\", \"\\\\\", \"%\", \"—\", \"#\", \"$\", \"[\", \"]\", \"|\", \"{\", \"}\", \"~\", \"`\", \"+\", \"*\"]\n",
+    "\n",
+    "MONTHS = [\" january \", \" february \", \" march \", \" april \", \" may \", \" june \", \" july \", \" august \", \" september \", \" october \", \" november \", \" december \",\n",
+    "          \" jan \", \" feb \", \" mar \", \" apr \", \" jun \", \" jul \", \" aug \", \" sep \", \" oct \", \" nov \", \" dec \"]\n",
+    "\n",
+    "          \n",
+    "def _remove_multiple_spaces(text: str) -> str:\n",
+    "    return re.sub('\\s+', ' ', text)\n",
+    "\n",
+    "\n",
+    "def _remove_special_characters(text: str) -> str:\n",
+    "    \"\"\"Return ``text`` with every entry of CHARACTERS_TO_REPLACE swapped for one space.\"\"\"\n",
+    "    cleaned = text\n",
+    "    for unwanted in CHARACTERS_TO_REPLACE:\n",
+    "        # Splitting on the (non-empty) substring and re-joining with a space\n",
+    "        # substitutes each occurrence, in list order.\n",
+    "        cleaned = \" \".join(cleaned.split(unwanted))\n",
+    "    return cleaned\n",
+    "\n",
+    "\n",
+    "def _remove_stop_words(text: str) -> str:\n",
+    "    \"\"\"Lower-case the spaCy tokens of ``text`` and drop those found in STOP_WORDS.\n",
+    "\n",
+    "    The surviving tokens are returned joined by single spaces.\n",
+    "    \"\"\"\n",
+    "    normalised_tokens = (tok.text.lower().strip() for tok in NLP(text))\n",
+    "    return ' '.join(word for word in normalised_tokens if word not in STOP_WORDS)\n",
+    "\n",
+    "\n",
+    "def _replace_digits(text):\n",
+    "    return re.sub(r'[\\d-]+', 'NUMBER', text)\n",
+    "\n",
+    "\n",
+    "def _delete_one_letter_word(text):\n",
+    "    text_as_list = text.split()\n",
+    "    text_as_list = [element for element in text_as_list if len(element) > 1]\n",
+    "    return ' '.join(text_as_list)\n",
+    "\n",
+    "\n",
+    "def _remove_consecutive_duplicates(text):\n",
+    "    text_as_list = text.split()\n",
+    "    last_seen = None\n",
+    "    result = []\n",
+    "    for x in text_as_list:\n",
+    "        if x != last_seen:\n",
+    "            result.append(x)\n",
+    "        last_seen = x\n",
+    "    return ' '.join(result)\n",
+    "\n",
+    "          \n",
+    "def _replace_months(text: str) -> str:\n",
+    "    \"\"\"Replace every space-delimited month name from MONTHS with ``MONTH``.\n",
+    "\n",
+    "    ``text`` is padded with one space on each side so that a month at the very\n",
+    "    start or end is matched too; that padding is part of the returned string.\n",
+    "    Matching is case-sensitive, exactly like the entries of MONTHS.\n",
+    "    \"\"\"\n",
+    "    padded = \" \" + text + \" \"\n",
+    "    if not MONTHS:\n",
+    "        return padded\n",
+    "    names = \"|\".join(re.escape(month.strip()) for month in MONTHS)\n",
+    "    # Lookarounds test for the surrounding spaces without consuming them, so two identical\n",
+    "    # months in a row (\"may may\") are both replaced; a plain str.replace consumes the shared\n",
+    "    # space and skips the second one.\n",
+    "    return re.sub(\"(?<= )(?:\" + names + \")(?= )\", \"MONTH\", padded)\n",
+    "\n",
+    "\n",
+    "def _replace_with_lemma(text: str) -> str:\n",
+    "    \"\"\"Return the lemmas of all spaCy tokens of ``text`` joined by single spaces.\"\"\"\n",
+    "    return \" \".join(token.lemma_ for token in NLP(text))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "7acb12c5-c374-4062-a349-d95985841eaa",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from json import JSONEncoder\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.multiclass import OneVsRestClassifier\n",
+    "from sklearn.metrics import f1_score, roc_auc_score, accuracy_score\n",
+    "from sklearn.metrics import coverage_error\n",
+    "from sklearn.metrics import label_ranking_average_precision_score\n",
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from sklearn.metrics import f1_score, roc_auc_score, accuracy_score\n",
+    "from sklearn.metrics import coverage_error\n",
+    "from sklearn.metrics import label_ranking_average_precision_score\n",
+    "from imblearn.pipeline import Pipeline\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "from sklearn.svm import LinearSVC\n",
+    "\n",
+    "\n",
+    "# adapted from: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/\n",
+    "def multi_label_metrics(ytest,y_pred):\n",
+    "    \"\"\"Print a dictionary of multi-label metrics and return the micro-averaged F1 score.\n",
+    "\n",
+    "    Args:\n",
+    "        ytest: ground-truth labels.\n",
+    "        y_pred: predictions compared against ``ytest`` (presumably in the same\n",
+    "            label-indicator layout; confirm at the call site).\n",
+    "\n",
+    "    Returns:\n",
+    "        The micro-averaged F1 score; the remaining metrics are only printed.\n",
+    "    \"\"\"\n",
+    "    # NOTE(review): y_pred is also passed to the ranking-style metrics (roc_auc,\n",
+    "    # coverage_error, label ranking average precision). If callers pass hard 0/1\n",
+    "    # predictions instead of scores those values are coarse; confirm.\n",
+    "    micro_f1 = f1_score(y_true=ytest, y_pred=y_pred, average='micro')\n",
+    "    report = {\n",
+    "        'f1': micro_f1,\n",
+    "        'roc_auc': roc_auc_score(ytest, y_pred, average='micro'),\n",
+    "        'accuracy': accuracy_score(ytest, y_pred),\n",
+    "        'coverage_error': coverage_error(ytest, y_pred),\n",
+    "        'label_ranking_average_precision_score': label_ranking_average_precision_score(ytest, y_pred),\n",
+    "    }\n",
+    "    print(report)\n",
+    "    return micro_f1\n",
+    "\n",
+    "\n",
+    "class NumpyArrayEncoder(JSONEncoder):\n",
+    "    def default(self, obj):\n",
+    "        if isinstance(obj, np.ndarray):\n",
+    "            return obj.tolist()\n",
+    "        return JSONEncoder.default(self, obj)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "20c2fd71",
+   "metadata": {},
+   "source": [
+    "Dataset available at s3://d-ew1-ted-ai-ml-data/experiments/roberta_cpv_v0.0.1/20231020-dataset_formatted_with_title_and_short_desription_combined-all-EN-notices-ted_with_preprocessing.csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ab8bb558-7ece-4e80-942d-33fea65e1a0e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>year</th>\n",
+       "      <th>title_texte</th>\n",
+       "      <th>85</th>\n",
+       "      <th>44</th>\n",
+       "      <th>50</th>\n",
+       "      <th>80</th>\n",
+       "      <th>73</th>\n",
+       "      <th>45</th>\n",
+       "      <th>71</th>\n",
+       "      <th>79</th>\n",
+       "      <th>...</th>\n",
+       "      <th>03</th>\n",
+       "      <th>24</th>\n",
+       "      <th>43</th>\n",
+       "      <th>19</th>\n",
+       "      <th>41</th>\n",
+       "      <th>37</th>\n",
+       "      <th>14</th>\n",
+       "      <th>16</th>\n",
+       "      <th>76</th>\n",
+       "      <th>processed_text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2020</td>\n",
+       "      <td>CRH0485 Biomass Boiler Systems Fuel Supply Mai...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>crhNUMBER biomass boiler systems fuel supply m...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2021</td>\n",
+       "      <td>Extension to Cullahill Community Centre Two st...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>extension cullahill community centre storey ex...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2018</td>\n",
+       "      <td>Supply and Fit and Supply only UPVC Doors and ...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>supply fit supply upvc doors windows appointme...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>2018</td>\n",
+       "      <td>The Supply of Materials and Associated Managed...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>supply materials associated managed services p...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>2018</td>\n",
+       "      <td>Catalyst Building Shell and Core Works Constru...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>catalyst building shell core works constructio...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 48 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   year                                        title_texte     85     44  \\\n",
+       "0  2020  CRH0485 Biomass Boiler Systems Fuel Supply Mai...  False  False   \n",
+       "2  2021  Extension to Cullahill Community Centre Two st...  False  False   \n",
+       "4  2018  Supply and Fit and Supply only UPVC Doors and ...  False   True   \n",
+       "5  2018  The Supply of Materials and Associated Managed...  False   True   \n",
+       "6  2018  Catalyst Building Shell and Core Works Constru...  False  False   \n",
+       "\n",
+       "      50     80     73     45     71     79  ...     03     24     43     19  \\\n",
+       "0   True  False  False  False  False  False  ...  False  False  False  False   \n",
+       "2  False  False  False   True  False  False  ...  False  False  False  False   \n",
+       "4  False  False  False   True  False  False  ...  False  False  False  False   \n",
+       "5  False  False  False   True  False  False  ...   True   True   True  False   \n",
+       "6  False  False  False   True  False  False  ...  False  False  False  False   \n",
+       "\n",
+       "      41     37     14     16     76  \\\n",
+       "0  False  False  False  False  False   \n",
+       "2  False  False  False  False  False   \n",
+       "4  False  False  False  False  False   \n",
+       "5  False  False  False  False  False   \n",
+       "6  False  False  False  False  False   \n",
+       "\n",
+       "                                      processed_text  \n",
+       "0  crhNUMBER biomass boiler systems fuel supply m...  \n",
+       "2  extension cullahill community centre storey ex...  \n",
+       "4  supply fit supply upvc doors windows appointme...  \n",
+       "5  supply materials associated managed services p...  \n",
+       "6  catalyst building shell core works constructio...  \n",
+       "\n",
+       "[5 rows x 48 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import joblib\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Read the local copy of the dataset; the first CSV column is used as the row index.\n",
+    "df = pd.read_csv(\n",
+    "    \"20231020-dataset_formatted_with_title_and_short_desription_combined-all-EN-notices-ted_with_preprocessing.csv\",\n",
+    "    index_col=0,\n",
+    ")\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "6f39c09b-fef8-492e-a27e-5e08754a38f7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>year</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>264874.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>2018.005618</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>2.576416</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>2014.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>2016.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>2018.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>2020.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>2023.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                year\n",
+       "count  264874.000000\n",
+       "mean     2018.005618\n",
+       "std         2.576416\n",
+       "min      2014.000000\n",
+       "25%      2016.000000\n",
+       "50%      2018.000000\n",
+       "75%      2020.000000\n",
+       "max      2023.000000"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "ec5d4867-1e4e-4a35-9ae5-bc07f4d77253",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title_texte</th>\n",
+       "      <th>85</th>\n",
+       "      <th>44</th>\n",
+       "      <th>50</th>\n",
+       "      <th>80</th>\n",
+       "      <th>73</th>\n",
+       "      <th>45</th>\n",
+       "      <th>71</th>\n",
+       "      <th>79</th>\n",
+       "      <th>90</th>\n",
+       "      <th>...</th>\n",
+       "      <th>18</th>\n",
+       "      <th>03</th>\n",
+       "      <th>24</th>\n",
+       "      <th>43</th>\n",
+       "      <th>19</th>\n",
+       "      <th>41</th>\n",
+       "      <th>37</th>\n",
+       "      <th>14</th>\n",
+       "      <th>16</th>\n",
+       "      <th>76</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>264873</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>...</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "      <td>264874</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>unique</th>\n",
+       "      <td>264873</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>top</th>\n",
+       "      <td>crhNUMBER biomass boiler systems fuel supply m...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>freq</th>\n",
+       "      <td>1</td>\n",
+       "      <td>240626</td>\n",
+       "      <td>253217</td>\n",
+       "      <td>247085</td>\n",
+       "      <td>255181</td>\n",
+       "      <td>256314</td>\n",
+       "      <td>230076</td>\n",
+       "      <td>237177</td>\n",
+       "      <td>230983</td>\n",
+       "      <td>247413</td>\n",
+       "      <td>...</td>\n",
+       "      <td>261753</td>\n",
+       "      <td>262175</td>\n",
+       "      <td>262519</td>\n",
+       "      <td>262520</td>\n",
+       "      <td>263780</td>\n",
+       "      <td>264607</td>\n",
+       "      <td>263271</td>\n",
+       "      <td>263438</td>\n",
+       "      <td>264175</td>\n",
+       "      <td>263982</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>4 rows × 46 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              title_texte      85      44  \\\n",
+       "count                                              264873  264874  264874   \n",
+       "unique                                             264873       2       2   \n",
+       "top     crhNUMBER biomass boiler systems fuel supply m...   False   False   \n",
+       "freq                                                    1  240626  253217   \n",
+       "\n",
+       "            50      80      73      45      71      79      90  ...      18  \\\n",
+       "count   264874  264874  264874  264874  264874  264874  264874  ...  264874   \n",
+       "unique       2       2       2       2       2       2       2  ...       2   \n",
+       "top      False   False   False   False   False   False   False  ...   False   \n",
+       "freq    247085  255181  256314  230076  237177  230983  247413  ...  261753   \n",
+       "\n",
+       "            03      24      43      19      41      37      14      16      76  \n",
+       "count   264874  264874  264874  264874  264874  264874  264874  264874  264874  \n",
+       "unique       2       2       2       2       2       2       2       2       2  \n",
+       "top      False   False   False   False   False   False   False   False   False  \n",
+       "freq    262175  262519  262520  263780  264607  263271  263438  264175  263982  \n",
+       "\n",
+       "[4 rows x 46 columns]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Train on the preprocessed text: copy it over `title_texte`, then drop the helper columns.\n",
+    "# Both steps are guarded so the cell can be re-run on the already-trimmed frame\n",
+    "# without raising a KeyError on the columns it removed the first time.\n",
+    "if 'processed_text' in df.columns:\n",
+    "    df['title_texte'] = df['processed_text']\n",
+    "df = df.drop(columns=['year', 'processed_text'], errors='ignore')\n",
+    "df.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fc9e0dd3-826f-4c82-840c-4b77ba88adac",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[I 2023-10-23 10:12:54,833] A new study created in memory with name: no-name-7245d173-c381-40bc-9ae0-61779ddb7063\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains (1, 1) which is of type tuple.\n",
+      "  warnings.warn(message)\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains (1, 2) which is of type tuple.\n",
+      "  warnings.warn(message)\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains (1, 3) which is of type tuple.\n",
+      "  warnings.warn(message)\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:1682: RuntimeWarning: divide by zero encountered in true_divide\n",
+      "  idf = np.log(n_samples / df) + 1\n",
+      "[I 2023-10-23 10:20:42,763] Trial 0 finished with value: 0.7696891850480208 and parameters: {'ngram_range': (1, 3), 'min_df': 1, 'max_df': 0.7186761879881496, 'C': 4.070871372844556}. Best is trial 0 with value: 0.7696891850480208.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'f1': 0.7696891850480208, 'roc_auc': 0.8553740684010729, 'accuracy': 0.6332043416705994, 'coverage_error': 13.564832468145351, 'label_ranking_average_precision_score': 0.7463378951328808}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains (1, 1) which is of type tuple.\n",
+      "  warnings.warn(message)\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains (1, 2) which is of type tuple.\n",
+      "  warnings.warn(message)\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains (1, 3) which is of type tuple.\n",
+      "  warnings.warn(message)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn import metrics\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.multiclass import OneVsRestClassifier\n",
+    "import optuna\n",
+    "import json\n",
+    "\n",
+    "ALL_DIVISIONS = ['85', '44', '50', '80', '73', '45', '71', '79', '90', '30', '35', '33', '55', '72', '48', '38', '09', '75', '66', '64', '42', '34', '60', '92', '39', '31', '98', '51', '32', '65', '77', '22', '63', '15', '70', '18', '03', '24', '43', '19', '41', '37', '14', '16', '76']\n",
+    "\n",
+    "\n",
+    "def get_model(tfidf_param=None, linear_svc_param=None):\n",
+    "    \"\"\"Build an unfitted TF-IDF -> one-vs-rest LinearSVC pipeline.\n",
+    "\n",
+    "    Args:\n",
+    "        tfidf_param: keyword arguments forwarded to ``TfidfVectorizer``;\n",
+    "            ``None`` (the default) means library defaults.\n",
+    "        linear_svc_param: keyword arguments forwarded to ``LinearSVC``;\n",
+    "            ``None`` (the default) means library defaults.\n",
+    "\n",
+    "    Returns:\n",
+    "        A ``Pipeline`` with the steps ``'tfidf'`` and ``'clf'``.\n",
+    "    \"\"\"\n",
+    "    # None sentinels instead of `{}` defaults: a dict default is shared between calls.\n",
+    "    vectorizer = TfidfVectorizer(**(tfidf_param or {}))\n",
+    "    classifier = OneVsRestClassifier(LinearSVC(**(linear_svc_param or {})))\n",
+    "    return Pipeline([\n",
+    "        ('tfidf', vectorizer),\n",
+    "        ('clf', classifier),\n",
+    "    ])\n",
+    "\n",
+    "\n",
+    "def objective(trial):\n",
+    "    \"\"\"Optuna objective: fit one pipeline and score it on a held-out 20% split.\n",
+    "\n",
+    "    Args:\n",
+    "        trial: Optuna trial used to sample `ngram_range`, `min_df`, `max_df` and `C`.\n",
+    "\n",
+    "    Returns:\n",
+    "        The score returned by ``multi_label_metrics(y_test, y_pred)``; the study maximises it.\n",
+    "    \"\"\"\n",
+    "    seed = 736283\n",
+    "    # One text is missing (count 264873 of 264874): feed it as an empty document\n",
+    "    # instead of letting the string cast turn it into the literal token 'nan'.\n",
+    "    texts = df['title_texte'].fillna('').astype(str).values\n",
+    "    labels = df[ALL_DIVISIONS]\n",
+    "    # Seeded split: every trial is scored on the same held-out rows, so differences\n",
+    "    # between trials reflect the hyper-parameters and not the luck of the split.\n",
+    "    X_train, X_test, y_train, y_test = train_test_split(\n",
+    "        texts, labels, test_size=0.2, random_state=seed\n",
+    "    )\n",
+    "\n",
+    "    # NOTE: tuple choices trigger Optuna's 'persistent storage' UserWarning; harmless for an in-memory study.\n",
+    "    tfidf_param = {\n",
+    "        'ngram_range': trial.suggest_categorical('ngram_range', [(1,1),(1,2),(1,3)]),\n",
+    "        'min_df': trial.suggest_int('min_df', 1, 4),\n",
+    "        'max_df': trial.suggest_float('max_df', 0.2, 1.0),\n",
+    "    }\n",
+    "    linear_svc_param = {\n",
+    "        'max_iter': 10000,\n",
+    "        'random_state': seed,\n",
+    "        'C': trial.suggest_float('C', 1, 20),\n",
+    "    }\n",
+    "\n",
+    "    model = get_model(tfidf_param, linear_svc_param)\n",
+    "    model.fit(X_train, y_train)\n",
+    "    y_pred = model.predict(X_test)\n",
+    "\n",
+    "    return multi_label_metrics(y_test, y_pred)\n",
+    "    \n",
+    "\n",
+    "# Seed the sampler so the sequence of suggested hyper-parameters is repeatable.\n",
+    "study = optuna.create_study(\n",
+    "    direction=\"maximize\",\n",
+    "    sampler=optuna.samplers.TPESampler(seed=736283),\n",
+    ")\n",
+    "study.optimize(objective, n_trials=50)\n",
+    "print(study.best_trial)\n",
+    "\n",
+    "# Persist every trial (parameters, value, timings) for later inspection.\n",
+    "results_df = study.trials_dataframe()\n",
+    "results_df.to_csv(\"results_optuna_test_size_0.2.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "6763e604-4d5c-4e8f-bce9-0d02999174b1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "FrozenTrial(number=15, state=TrialState.COMPLETE, values=[0.7717959753250648], datetime_start=datetime.datetime(2023, 10, 23, 11, 46, 58, 612710), datetime_complete=datetime.datetime(2023, 10, 23, 11, 59, 5, 98837), params={'ngram_range': (1, 3), 'min_df': 1, 'max_df': 0.20631275746084188, 'C': 7.91658266485577}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'ngram_range': CategoricalDistribution(choices=((1, 1), (1, 2), (1, 3))), 'min_df': IntDistribution(high=4, log=False, low=1, step=1), 'max_df': FloatDistribution(high=1.0, log=False, low=0.2, step=None), 'C': FloatDistribution(high=20.0, log=False, low=1.0, step=None)}, trial_id=15, value=None)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(study.best_trial)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e47d835a-4bca-40bf-b925-0ae6dd689a93",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[I 2023-10-23 18:14:56,076] A new study created in memory with name: no-name-da45145b-04c7-48dd-b8e3-740fae921e54\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains (1, 4) which is of type tuple.\n",
+      "  warnings.warn(message)\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:1682: RuntimeWarning: divide by zero encountered in true_divide\n",
+      "  idf = np.log(n_samples / df) + 1\n",
+      "[I 2023-10-23 18:36:01,920] Trial 0 finished with value: 0.7721055280543823 and parameters: {'ngram_range': (1, 4), 'min_df': 1, 'max_df': 0.9852723810399673, 'C': 10.773831912469}. Best is trial 0 with value: 0.7721055280543823.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'f1': 0.7721055280543823, 'roc_auc': 0.8647277722373967, 'accuracy': 0.6296743747050495, 'coverage_error': 12.754412458706938, 'label_ranking_average_precision_score': 0.7507013010372966}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains (1, 4) which is of type tuple.\n",
+      "  warnings.warn(message)\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:1682: RuntimeWarning: divide by zero encountered in true_divide\n",
+      "  idf = np.log(n_samples / df) + 1\n",
+      "[I 2023-10-23 18:55:45,980] Trial 1 finished with value: 0.7684995827042042 and parameters: {'ngram_range': (1, 4), 'min_df': 0, 'max_df': 0.21576007770557593, 'C': 9.69934645009312}. Best is trial 0 with value: 0.7721055280543823.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'f1': 0.7684995827042042, 'roc_auc': 0.8624590842431155, 'accuracy': 0.6257102406795658, 'coverage_error': 12.857461066540822, 'label_ranking_average_precision_score': 0.7478550074933777}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains (1, 4) which is of type tuple.\n",
+      "  warnings.warn(message)\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:1682: RuntimeWarning: divide by zero encountered in true_divide\n",
+      "  idf = np.log(n_samples / df) + 1\n",
+      "[I 2023-10-23 19:00:43,148] Trial 2 finished with value: 0.7459586466165414 and parameters: {'ngram_range': (1, 4), 'min_df': 0, 'max_df': 0.7668359712285692, 'C': 0.6965684555834544}. Best is trial 0 with value: 0.7721055280543823.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'f1': 0.7459586466165414, 'roc_auc': 0.8357195765109585, 'accuracy': 0.5965266635205285, 'coverage_error': 15.051703633789524, 'label_ranking_average_precision_score': 0.7199785739228183}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains (1, 4) which is of type tuple.\n",
+      "  warnings.warn(message)\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:1682: RuntimeWarning: divide by zero encountered in true_divide\n",
+      "  idf = np.log(n_samples / df) + 1\n",
+      "[I 2023-10-23 19:18:23,225] Trial 3 finished with value: 0.7658394830420984 and parameters: {'ngram_range': (1, 4), 'min_df': 1, 'max_df': 0.11192775225799664, 'C': 8.263800897801081}. Best is trial 0 with value: 0.7721055280543823.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'f1': 0.7658394830420984, 'roc_auc': 0.8605805364089498, 'accuracy': 0.6230486078338839, 'coverage_error': 13.015252477583767, 'label_ranking_average_precision_score': 0.745160098868434}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/distributions.py:524: UserWarning: Choices for a categorical distribution should be a tuple of None, bool, int, float and str for persistent storage but contains (1, 4) which is of type tuple.\n",
+      "  warnings.warn(message)\n",
+      "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:1682: RuntimeWarning: divide by zero encountered in true_divide\n",
+      "  idf = np.log(n_samples / df) + 1\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn import metrics\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.multiclass import OneVsRestClassifier\n",
+    "import optuna\n",
+    "import json\n",
+    "\n",
+    "ALL_DIVISIONS = ['85', '44', '50', '80', '73', '45', '71', '79', '90', '30', '35', '33', '55', '72', '48', '38', '09', '75', '66', '64', '42', '34', '60', '92', '39', '31', '98', '51', '32', '65', '77', '22', '63', '15', '70', '18', '03', '24', '43', '19', '41', '37', '14', '16', '76']\n",
+    "\n",
+    "\n",
+    "def get_model(tfidf_param=None, linear_svc_param=None):\n",
+    "    \"\"\"Build an unfitted TF-IDF -> one-vs-rest LinearSVC pipeline.\n",
+    "\n",
+    "    Args:\n",
+    "        tfidf_param: keyword arguments forwarded to ``TfidfVectorizer``;\n",
+    "            ``None`` (the default) means library defaults.\n",
+    "        linear_svc_param: keyword arguments forwarded to ``LinearSVC``;\n",
+    "            ``None`` (the default) means library defaults.\n",
+    "\n",
+    "    Returns:\n",
+    "        A ``Pipeline`` with the steps ``'tfidf'`` and ``'clf'``.\n",
+    "    \"\"\"\n",
+    "    # None sentinels instead of `{}` defaults: a dict default is shared between calls.\n",
+    "    vectorizer = TfidfVectorizer(**(tfidf_param or {}))\n",
+    "    classifier = OneVsRestClassifier(LinearSVC(**(linear_svc_param or {})))\n",
+    "    return Pipeline([\n",
+    "        ('tfidf', vectorizer),\n",
+    "        ('clf', classifier),\n",
+    "    ])\n",
+    "\n",
+    "\n",
+    "def objective(trial):\n",
+    "    \"\"\"Optuna objective for the (1, 4)-gram search: fit one pipeline and score it on a held-out 20% split.\n",
+    "\n",
+    "    Args:\n",
+    "        trial: Optuna trial used to sample `ngram_range`, `min_df`, `max_df` and `C`.\n",
+    "\n",
+    "    Returns:\n",
+    "        The score returned by ``multi_label_metrics(y_test, y_pred)``; the study maximises it.\n",
+    "    \"\"\"\n",
+    "    seed = 736283\n",
+    "    # One text is missing (count 264873 of 264874): feed it as an empty document\n",
+    "    # instead of letting the string cast turn it into the literal token 'nan'.\n",
+    "    texts = df['title_texte'].fillna('').astype(str).values\n",
+    "    labels = df[ALL_DIVISIONS]\n",
+    "    # Seeded split: every trial is scored on the same held-out rows, so differences\n",
+    "    # between trials reflect the hyper-parameters and not the luck of the split.\n",
+    "    X_train, X_test, y_train, y_test = train_test_split(\n",
+    "        texts, labels, test_size=0.2, random_state=seed\n",
+    "    )\n",
+    "\n",
+    "    # NOTE: the tuple choice triggers Optuna's 'persistent storage' UserWarning; harmless for an in-memory study.\n",
+    "    # NOTE(review): an integer min_df of 0 and of 1 presumably keep the same vocabulary\n",
+    "    # (every extracted term occurs in at least one document) - confirm this dimension is worth searching.\n",
+    "    tfidf_param = {\n",
+    "        'ngram_range': trial.suggest_categorical('ngram_range', [(1,4)]),\n",
+    "        'min_df': trial.suggest_int('min_df', 0, 1),\n",
+    "        'max_df': trial.suggest_float('max_df', 0.05, 1.0),\n",
+    "    }\n",
+    "    linear_svc_param = {\n",
+    "        'max_iter': 10000,\n",
+    "        'random_state': seed,\n",
+    "        'C': trial.suggest_float('C', 0.1, 12),\n",
+    "    }\n",
+    "\n",
+    "    model = get_model(tfidf_param, linear_svc_param)\n",
+    "    model.fit(X_train, y_train)\n",
+    "    y_pred = model.predict(X_test)\n",
+    "\n",
+    "    return multi_label_metrics(y_test, y_pred)\n",
+    "    \n",
+    "\n",
+    "# Seed the sampler so the sequence of suggested hyper-parameters is repeatable.\n",
+    "study = optuna.create_study(\n",
+    "    direction=\"maximize\",\n",
+    "    sampler=optuna.samplers.TPESampler(seed=736283),\n",
+    ")\n",
+    "study.optimize(objective, n_trials=15, n_jobs=1)\n",
+    "print(study.best_trial)\n",
+    "\n",
+    "# Persist every trial (parameters, value, timings) for later inspection.\n",
+    "results_df = study.trials_dataframe()\n",
+    "results_df.to_csv(\"results_optuna_test_size_0.2_1_4.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b06a8569-8d42-4e92-ada2-373de3803abc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "print(study.best_trial)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6bd74a67-f91b-4622-addd-00ed09852fe9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn import metrics\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.multiclass import OneVsRestClassifier\n",
+    "import optuna\n",
+    "import json\n",
+    "\n",
+    "ALL_DIVISIONS = ['85', '44', '50', '80', '73', '45', '71', '79', '90', '30', '35', '33', '55', '72', '48', '38', '09', '75', '66', '64', '42', '34', '60', '92', '39', '31', '98', '51', '32', '65', '77', '22', '63', '15', '70', '18', '03', '24', '43', '19', '41', '37', '14', '16', '76']\n",
+    "\n",
+    "\n",
+    "def get_model(tfidf_param=None, linear_svc_param=None):\n",
+    "    \"\"\"Build an unfitted TF-IDF -> one-vs-rest LinearSVC pipeline.\n",
+    "\n",
+    "    Args:\n",
+    "        tfidf_param: keyword arguments forwarded to ``TfidfVectorizer``;\n",
+    "            ``None`` (the default) means library defaults.\n",
+    "        linear_svc_param: keyword arguments forwarded to ``LinearSVC``;\n",
+    "            ``None`` (the default) means library defaults.\n",
+    "\n",
+    "    Returns:\n",
+    "        A ``Pipeline`` with the steps ``'tfidf'`` and ``'clf'``.\n",
+    "    \"\"\"\n",
+    "    # None sentinels instead of `{}` defaults: a dict default is shared between calls.\n",
+    "    vectorizer = TfidfVectorizer(**(tfidf_param or {}))\n",
+    "    classifier = OneVsRestClassifier(LinearSVC(**(linear_svc_param or {})))\n",
+    "    return Pipeline([\n",
+    "        ('tfidf', vectorizer),\n",
+    "        ('clf', classifier),\n",
+    "    ])\n",
+    "\n",
+    "\n",
+    "def objective(trial):\n",
+    "    \"\"\"Optuna objective for the (1, 3)-gram search: fit one pipeline and score it on a held-out 20% split.\n",
+    "\n",
+    "    Args:\n",
+    "        trial: Optuna trial used to sample `ngram_range`, `min_df`, `max_df` and `C`.\n",
+    "\n",
+    "    Returns:\n",
+    "        The score returned by ``multi_label_metrics(y_test, y_pred)``; the study maximises it.\n",
+    "    \"\"\"\n",
+    "    seed = 736283\n",
+    "    # One text is missing (count 264873 of 264874): feed it as an empty document\n",
+    "    # instead of letting the string cast turn it into the literal token 'nan'.\n",
+    "    texts = df['title_texte'].fillna('').astype(str).values\n",
+    "    labels = df[ALL_DIVISIONS]\n",
+    "    # Seeded split: every trial is scored on the same held-out rows, so differences\n",
+    "    # between trials reflect the hyper-parameters and not the luck of the split.\n",
+    "    X_train, X_test, y_train, y_test = train_test_split(\n",
+    "        texts, labels, test_size=0.2, random_state=seed\n",
+    "    )\n",
+    "\n",
+    "    # NOTE: the tuple choice triggers Optuna's 'persistent storage' UserWarning; harmless for an in-memory study.\n",
+    "    # NOTE(review): an integer min_df of 0 and of 1 presumably keep the same vocabulary\n",
+    "    # (every extracted term occurs in at least one document) - confirm this dimension is worth searching.\n",
+    "    tfidf_param = {\n",
+    "        'ngram_range': trial.suggest_categorical('ngram_range', [(1,3)]),\n",
+    "        'min_df': trial.suggest_int('min_df', 0, 1),\n",
+    "        'max_df': trial.suggest_float('max_df', 0.05, 1.0),\n",
+    "    }\n",
+    "    linear_svc_param = {\n",
+    "        'max_iter': 10000,\n",
+    "        'random_state': seed,\n",
+    "        'C': trial.suggest_float('C', 0.1, 12),\n",
+    "    }\n",
+    "\n",
+    "    model = get_model(tfidf_param, linear_svc_param)\n",
+    "    model.fit(X_train, y_train)\n",
+    "    y_pred = model.predict(X_test)\n",
+    "\n",
+    "    return multi_label_metrics(y_test, y_pred)\n",
+    "    \n",
+    "\n",
+    "# Seed the sampler so the suggested hyper-parameters are drawn from a fixed random stream.\n",
+    "# With n_jobs=2 the order in which trials finish still depends on thread scheduling,\n",
+    "# so repeated runs are only approximately repeatable.\n",
+    "study = optuna.create_study(\n",
+    "    direction=\"maximize\",\n",
+    "    sampler=optuna.samplers.TPESampler(seed=736283),\n",
+    ")\n",
+    "study.optimize(objective, n_trials=50, n_jobs=2)\n",
+    "print(study.best_trial)\n",
+    "\n",
+    "# Persist every trial (parameters, value, timings) for later inspection.\n",
+    "results_df = study.trials_dataframe()\n",
+    "results_df.to_csv(\"results_optuna_test_size_0.2_1_3_full_C.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "1a488b5e-fb7e-431a-93d5-5a78bfbaba39",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "FrozenTrial(number=28, state=TrialState.COMPLETE, values=[0.7730330676439546], datetime_start=datetime.datetime(2023, 10, 24, 1, 5, 25, 456706), datetime_complete=datetime.datetime(2023, 10, 24, 1, 21, 37, 739041), params={'ngram_range': (1, 3), 'min_df': 0, 'max_df': 0.16415117211829994, 'C': 9.91716546491975}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'ngram_range': CategoricalDistribution(choices=((1, 3),)), 'min_df': IntDistribution(high=1, log=False, low=0, step=1), 'max_df': FloatDistribution(high=1.0, log=False, low=0.05, step=None), 'C': FloatDistribution(high=12.0, log=False, low=0.1, step=None)}, trial_id=28, value=None)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(study.best_trial)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_python3",
+   "language": "python",
+   "name": "conda_python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/20231025-roberta_finetune_with_full_EN_notices.ipynb b/20231025-roberta_finetune_with_full_EN_notices.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..904d6fc9a7fe3cbdab04c6e9968b5eee047d89b6
--- /dev/null
+++ b/20231025-roberta_finetune_with_full_EN_notices.ipynb
@@ -0,0 +1,1083 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "c29513aa-884a-40bb-b39d-f7bb8e3717c0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# %pip (not !pip) installs into the environment of the running kernel; a single call\n",
+    "# lets the resolver pick mutually compatible versions of all packages at once.\n",
+    "%pip install -q \"transformers[torch]\" datasets"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f58f73ec",
+   "metadata": {},
+   "source": [
+    "Dataset is available at s3://d-ew1-ted-ai-ml-data/experiments/roberta_cpv_v0.0.1/20231020-dataset_formatted_with_title_and_short_desription_combined-all-EN-notices-ted_with_preprocessing.csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "383ae363-a974-4b31-b68e-6bffb1f5fc41",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>year</th>\n",
+       "      <th>title_texte</th>\n",
+       "      <th>85</th>\n",
+       "      <th>44</th>\n",
+       "      <th>50</th>\n",
+       "      <th>80</th>\n",
+       "      <th>73</th>\n",
+       "      <th>45</th>\n",
+       "      <th>71</th>\n",
+       "      <th>79</th>\n",
+       "      <th>...</th>\n",
+       "      <th>03</th>\n",
+       "      <th>24</th>\n",
+       "      <th>43</th>\n",
+       "      <th>19</th>\n",
+       "      <th>41</th>\n",
+       "      <th>37</th>\n",
+       "      <th>14</th>\n",
+       "      <th>16</th>\n",
+       "      <th>76</th>\n",
+       "      <th>processed_text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2020</td>\n",
+       "      <td>CRH0485 Biomass Boiler Systems Fuel Supply Mai...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>crhNUMBER biomass boiler systems fuel supply m...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2021</td>\n",
+       "      <td>Extension to Cullahill Community Centre Two st...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>extension cullahill community centre storey ex...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2018</td>\n",
+       "      <td>Supply and Fit and Supply only UPVC Doors and ...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>supply fit supply upvc doors windows appointme...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>2018</td>\n",
+       "      <td>The Supply of Materials and Associated Managed...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>supply materials associated managed services p...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>2018</td>\n",
+       "      <td>Catalyst Building Shell and Core Works Constru...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>catalyst building shell core works constructio...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 48 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   year                                        title_texte     85     44  \\\n",
+       "0  2020  CRH0485 Biomass Boiler Systems Fuel Supply Mai...  False  False   \n",
+       "2  2021  Extension to Cullahill Community Centre Two st...  False  False   \n",
+       "4  2018  Supply and Fit and Supply only UPVC Doors and ...  False   True   \n",
+       "5  2018  The Supply of Materials and Associated Managed...  False   True   \n",
+       "6  2018  Catalyst Building Shell and Core Works Constru...  False  False   \n",
+       "\n",
+       "      50     80     73     45     71     79  ...     03     24     43     19  \\\n",
+       "0   True  False  False  False  False  False  ...  False  False  False  False   \n",
+       "2  False  False  False   True  False  False  ...  False  False  False  False   \n",
+       "4  False  False  False   True  False  False  ...  False  False  False  False   \n",
+       "5  False  False  False   True  False  False  ...   True   True   True  False   \n",
+       "6  False  False  False   True  False  False  ...  False  False  False  False   \n",
+       "\n",
+       "      41     37     14     16     76  \\\n",
+       "0  False  False  False  False  False   \n",
+       "2  False  False  False  False  False   \n",
+       "4  False  False  False  False  False   \n",
+       "5  False  False  False  False  False   \n",
+       "6  False  False  False  False  False   \n",
+       "\n",
+       "                                      processed_text  \n",
+       "0  crhNUMBER biomass boiler systems fuel supply m...  \n",
+       "2  extension cullahill community centre storey ex...  \n",
+       "4  supply fit supply upvc doors windows appointme...  \n",
+       "5  supply materials associated managed services p...  \n",
+       "6  catalyst building shell core works constructio...  \n",
+       "\n",
+       "[5 rows x 48 columns]"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "df = pd.read_csv('20231020-dataset_formatted_with_title_and_short_desription_combined-all-EN-notices-ted_with_preprocessing.csv', index_col=0, low_memory=False)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "74ecd602-0cbd-4388-8e7c-65be633f08df",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>year</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>264874.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>2018.005618</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>2.576416</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>2014.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>2016.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>2018.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>2020.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>2023.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                year\n",
+       "count  264874.000000\n",
+       "mean     2018.005618\n",
+       "std         2.576416\n",
+       "min      2014.000000\n",
+       "25%      2016.000000\n",
+       "50%      2018.000000\n",
+       "75%      2020.000000\n",
+       "max      2023.000000"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "6d033dab-66be-4f05-8b08-d5a7bff04f1a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title_texte</th>\n",
+       "      <th>85</th>\n",
+       "      <th>44</th>\n",
+       "      <th>50</th>\n",
+       "      <th>80</th>\n",
+       "      <th>73</th>\n",
+       "      <th>45</th>\n",
+       "      <th>71</th>\n",
+       "      <th>79</th>\n",
+       "      <th>90</th>\n",
+       "      <th>...</th>\n",
+       "      <th>18</th>\n",
+       "      <th>03</th>\n",
+       "      <th>24</th>\n",
+       "      <th>43</th>\n",
+       "      <th>19</th>\n",
+       "      <th>41</th>\n",
+       "      <th>37</th>\n",
+       "      <th>14</th>\n",
+       "      <th>16</th>\n",
+       "      <th>76</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>CRH0485 Biomass Boiler Systems Fuel Supply Mai...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Extension to Cullahill Community Centre Two st...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Supply and Fit and Supply only UPVC Doors and ...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>The Supply of Materials and Associated Managed...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Catalyst Building Shell and Core Works Constru...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>True</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>...</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 46 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                         title_texte     85     44     50  \\\n",
+       "0  CRH0485 Biomass Boiler Systems Fuel Supply Mai...  False  False   True   \n",
+       "2  Extension to Cullahill Community Centre Two st...  False  False  False   \n",
+       "4  Supply and Fit and Supply only UPVC Doors and ...  False   True  False   \n",
+       "5  The Supply of Materials and Associated Managed...  False   True  False   \n",
+       "6  Catalyst Building Shell and Core Works Constru...  False  False  False   \n",
+       "\n",
+       "      80     73     45     71     79     90  ...     18     03     24     43  \\\n",
+       "0  False  False  False  False  False  False  ...  False  False  False  False   \n",
+       "2  False  False   True  False  False  False  ...  False  False  False  False   \n",
+       "4  False  False   True  False  False  False  ...  False  False  False  False   \n",
+       "5  False  False   True  False  False  False  ...  False   True   True   True   \n",
+       "6  False  False   True  False  False  False  ...  False  False  False  False   \n",
+       "\n",
+       "      19     41     37     14     16     76  \n",
+       "0  False  False  False  False  False  False  \n",
+       "2  False  False  False  False  False  False  \n",
+       "4  False  False  False  False  False  False  \n",
+       "5  False  False  False  False  False  False  \n",
+       "6  False  False  False  False  False  False  \n",
+       "\n",
+       "[5 rows x 46 columns]"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = df.drop(['year', 'processed_text'], axis=1)\n",
+    "df.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "4c8cea5e-6ef2-4b73-bb92-0d1bda7236fa",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# Seeded generator so the random train/test partition is identical on every\n",
+    "# Restart & Run All; without it the reported metrics are not reproducible.\n",
+    "SPLIT_SEED = 42\n",
+    "TRAIN_FRACTION = 0.8\n",
+    "\n",
+    "rng = np.random.default_rng(SPLIT_SEED)\n",
+    "# Boolean mask: True rows go to train, the remaining rows to test.\n",
+    "msk = rng.random(len(df)) < TRAIN_FRACTION\n",
+    "train = df[msk]\n",
+    "test = df[~msk]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "65f161a1-5d89-4630-904f-2946d3cbf602",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Every column after the first one (the text column) is treated as a label.\n",
+    "labels = train.columns[1:]\n",
+    "\n",
+    "# Two-way lookup between a label's position and its column name.\n",
+    "id2label = dict(enumerate(labels))\n",
+    "label2id = {name: position for position, name in id2label.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "957cb7e3-98bb-4f26-8e07-389e2853833b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"roberta-base\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "f7315330-4f2c-4a05-a914-1949abd10f49",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pyarrow as pa\n",
+    "import pyarrow.dataset as ds\n",
+    "import pandas as pd\n",
+    "from datasets import Dataset\n",
+    "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
+    "\n",
+    "# Dataset.from_pandas is the supported constructor; preserve_index=False keeps the\n",
+    "# filtered frame's pandas index from being carried along as an extra column.\n",
+    "datasettrain = Dataset.from_pandas(train, preserve_index=False)\n",
+    "datasettest = Dataset.from_pandas(test, preserve_index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "a508fc4c-38b8-463b-95a5-62ba09b0841e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def preprocess_data(df):\n",
+    "    \"\"\"Tokenize a batch of notices and attach one label vector per example.\n",
+    "\n",
+    "    Intended for `Dataset.map(..., batched=True)`.\n",
+    "\n",
+    "    Args:\n",
+    "        df: batch mapping each column name to a list of values; reads\n",
+    "            `title_texte` and every column named in the global `labels`.\n",
+    "\n",
+    "    Returns:\n",
+    "        The tokenizer encoding (padded/truncated to 128 tokens) with an extra\n",
+    "        `labels` entry holding one list of floats per example.\n",
+    "    \"\"\"\n",
+    "    text = df[\"title_texte\"]\n",
+    "    # Keep the encoding as plain lists on the CPU: `map` serialises the result to\n",
+    "    # Arrow anyway, and the Trainer moves each batch to the training device itself.\n",
+    "    encoding = tokenizer(text, padding=\"max_length\", truncation=True, max_length=128)\n",
+    "    # Float matrix with one row per example and one column per label.\n",
+    "    labels_matrix = np.zeros((len(text), len(labels)))\n",
+    "    for idx, label in enumerate(labels):\n",
+    "        labels_matrix[:, idx] = df[label]\n",
+    "\n",
+    "    encoding[\"labels\"] = labels_matrix.tolist()\n",
+    "\n",
+    "    return encoding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "361d5f06-7c2a-4d28-871b-8401fe647a17",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "709634b56ea5445d8e88ce00b5c764f9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/212201 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "encoded_dataset = datasettrain.map(preprocess_data, batched=True, remove_columns=datasettrain.column_names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "29a6f82f-2504-4f75-ab10-ccd16d7a08e6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fd4e2868e1fb4692a94710f8b213643c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/52673 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "encoded_dataset_test = datasettest.map(preprocess_data, batched=True, remove_columns=datasettest.column_names)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "28643d7a-cf68-443f-92f6-2aad42dfcc87",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "encoded_dataset.set_format(\"torch\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "5664abf4-7756-4cac-82fc-7a2d17e638d7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1d9e9f89ccea43beb2d689ea31afe864",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import AutoModelForSequenceClassification\n",
+    "\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(\"roberta-base\",\n",
+    "                                                           problem_type=\"multi_label_classification\",\n",
+    "                                                           num_labels=len(labels),\n",
+    "                                                           id2label=id2label,\n",
+    "                                                           label2id=label2id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "9a2b807c-eaba-439b-bcd2-b9cd5787acbe",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "model = model.to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "b81016f2-925c-44cf-8864-4a69c7f5c269",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "batch_size = 32\n",
+    "metric_name = \"f1\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "e3285460-2b44-41a1-8de2-24bdf48241e9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import TrainingArguments, Trainer\n",
+    "\n",
+    "args = TrainingArguments(\n",
+    "    f\"roberta-finetuned-CPV_English\",\n",
+    "    evaluation_strategy = \"epoch\",\n",
+    "    save_strategy = \"epoch\",\n",
+    "    learning_rate=2e-5,\n",
+    "    per_device_train_batch_size=batch_size,\n",
+    "    per_device_eval_batch_size=batch_size,\n",
+    "    num_train_epochs=50,\n",
+    "    weight_decay=0.01,\n",
+    "    load_best_model_at_end=True,\n",
+    "    metric_for_best_model=metric_name,\n",
+    "    push_to_hub=False,\n",
+    "    fp16=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "28f35332-5dd3-4e97-b654-57dff8058a54",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import f1_score, roc_auc_score, accuracy_score\n",
+    "from sklearn.metrics import coverage_error\n",
+    "from sklearn.metrics import label_ranking_average_precision_score\n",
+    "from transformers import EvalPrediction\n",
+    "import torch\n",
+    "\n",
+    "# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/\n",
+    "def multi_label_metrics(predictions, labels, threshold=0.5):\n",
+    "    \"\"\"Compute multi-label evaluation metrics from raw logits.\n",
+    "\n",
+    "    Args:\n",
+    "        predictions: array-like of logits, shape (batch_size, num_labels).\n",
+    "        labels: binary indicator matrix with the true labels.\n",
+    "        threshold: probability at or above which a label counts as predicted.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict with keys 'f1' (micro), 'roc_auc' (micro), 'accuracy',\n",
+    "        'coverage_error' and 'label_ranking_average_precision_score'.\n",
+    "    \"\"\"\n",
+    "    # first, apply sigmoid on predictions to get one probability per label\n",
+    "    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()\n",
+    "    # next, use threshold to turn them into hard 0/1 predictions\n",
+    "    y_pred = (probs >= threshold).astype(float)\n",
+    "    y_true = labels\n",
+    "    # threshold-based metrics are scored on the hard predictions\n",
+    "    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')\n",
+    "    accuracy = accuracy_score(y_true, y_pred)\n",
+    "    # ranking metrics expect continuous scores; thresholded 0/1 values would\n",
+    "    # reduce every ranking to ties and distort these numbers\n",
+    "    roc_auc = roc_auc_score(y_true, probs, average = 'micro')\n",
+    "    coverage_err = coverage_error(y_true, probs)\n",
+    "    label_ranking_average_precision = label_ranking_average_precision_score(y_true, probs)\n",
+    "    # return as dictionary\n",
+    "    metrics = {'f1': f1_micro_average,\n",
+    "               'roc_auc': roc_auc,\n",
+    "               'accuracy': accuracy,\n",
+    "               'coverage_error': coverage_err,\n",
+    "               'label_ranking_average_precision_score': label_ranking_average_precision}\n",
+    "    return metrics\n",
+    "\n",
+    "def compute_metrics(p: EvalPrediction):\n",
+    "    \"\"\"Metrics hook for the Trainer: score the predictions against the label ids.\"\"\"\n",
+    "    logits = p.predictions\n",
+    "    if isinstance(logits, tuple):\n",
+    "        # when predictions arrive as a tuple, only the first element is scored\n",
+    "        logits = logits[0]\n",
+    "    return multi_label_metrics(predictions=logits, labels=p.label_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "554e180b-559f-46b1-a02e-59d2daed0b3c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "torch.cuda.is_available()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "472164ba-54cb-4851-8df5-d93dae391fd6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "trainer = Trainer(\n",
+    "    model,\n",
+    "    args,\n",
+    "    train_dataset=encoded_dataset,\n",
+    "    eval_dataset=encoded_dataset_test,\n",
+    "    tokenizer=tokenizer,\n",
+    "    compute_metrics=compute_metrics\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34e9ac76-eea4-4797-b955-0419b762e263",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='25465' max='331600' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [ 25465/331600 1:41:53 < 20:25:01, 4.17 it/s, Epoch 3.84/50]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "      <th>F1</th>\n",
+       "      <th>Roc Auc</th>\n",
+       "      <th>Accuracy</th>\n",
+       "      <th>Coverage Error</th>\n",
+       "      <th>Label Ranking Average Precision Score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>0.064900</td>\n",
+       "      <td>0.061912</td>\n",
+       "      <td>0.657367</td>\n",
+       "      <td>0.777437</td>\n",
+       "      <td>0.524861</td>\n",
+       "      <td>19.282972</td>\n",
+       "      <td>0.659780</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>0.058100</td>\n",
+       "      <td>0.056515</td>\n",
+       "      <td>0.687337</td>\n",
+       "      <td>0.798397</td>\n",
+       "      <td>0.552465</td>\n",
+       "      <td>17.587739</td>\n",
+       "      <td>0.693517</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>0.053200</td>\n",
+       "      <td>0.055112</td>\n",
+       "      <td>0.692939</td>\n",
+       "      <td>0.796374</td>\n",
+       "      <td>0.568754</td>\n",
+       "      <td>17.750612</td>\n",
+       "      <td>0.702555</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "334d416d-bc87-4c5c-8100-d539d7d2b4fd",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'eval_loss': 0.10357002168893814,\n",
+       " 'eval_f1': 0.7764454799744905,\n",
+       " 'eval_roc_auc': 0.8742027330983739,\n",
+       " 'eval_accuracy': 0.6445807149773128,\n",
+       " 'eval_coverage_error': 12.356349552901866,\n",
+       " 'eval_label_ranking_average_precision_score': 0.7700240241623336,\n",
+       " 'eval_runtime': 108.3974,\n",
+       " 'eval_samples_per_second': 485.925,\n",
+       " 'eval_steps_per_second': 15.194,\n",
+       " 'epoch': 50.0}"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Run the trainer's evaluation; the returned metrics dict is displayed as the cell output.\n",
+    "# NOTE(review): the stored output reports 'epoch': 50.0 - confirm it belongs to the same run as the training log above.\n",
+    "trainer.evaluate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "04438c1a-c35e-4f3a-bc52-75455b2b59af",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Persist the trained model to a local directory; a later cell reloads it from this same path.\n",
+    "# NOTE(review): the tokenizer is later loaded from this directory too - confirm the trainer writes it here.\n",
+    "trainer.save_model(\"./roberta-base-cpv-division-multilabel\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "646657d7-635a-47b5-9812-a94d85555277",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
+    "\n",
+    "# Directory holding the saved fine-tuned model; defined once so both loads stay in sync.\n",
+    "MODEL_DIR = \"./roberta-base-cpv-division-multilabel\"\n",
+    "\n",
+    "# local_files_only=True on both calls: load strictly from disk, never download from the Hub.\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)\n",
+    "model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "75204e74-58bb-4d58-8060-f565231d5453",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['38']\n"
+     ]
+    }
+   ],
+   "source": [
+    "input_text = \"Acquisition of a UV-Visible spectrophotometer. The consultation concerns the acquisition, delivery, installation and commissioning of a UV-Visible spectrophotometer as well as associated services, including qualification, maintenance and training in the use of the equipment, for the European Directorate for the Quality of Medicines & Healthcare (EDQM).\"\n",
+    "\n",
+    "# Probability cut-off at or above which a label is reported.\n",
+    "THRESHOLD = 0.5\n",
+    "\n",
+    "# truncation=True caps the input at the tokenizer's maximum length, so a long\n",
+    "# notice cannot overflow the model's position embeddings.\n",
+    "encoded_input = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)\n",
+    "\n",
+    "# Inference only: disable dropout and skip building the autograd graph.\n",
+    "model.eval()\n",
+    "with torch.no_grad():\n",
+    "    logits = model(**encoded_input).logits\n",
+    "\n",
+    "# Each label gets its own independent sigmoid probability.\n",
+    "# squeeze(0) drops only the batch axis (a single text was encoded).\n",
+    "probs = torch.sigmoid(logits.squeeze(0)).cpu().numpy()\n",
+    "\n",
+    "# Keep every label whose probability reaches the threshold.\n",
+    "predicted_labels = [id2label[idx] for idx, prob in enumerate(probs) if prob >= THRESHOLD]\n",
+    "print(predicted_labels)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_pytorch_p310",
+   "language": "python",
+   "name": "conda_pytorch_p310"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}