diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..08c0aa9c25ceaa5274aaafb9551fe0b35fcd4b94 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.idea +venv +*.csv +*.html \ No newline at end of file diff --git a/20230323-unsupervised-clustering-lda-first-experiments.ipynb b/20230323-unsupervised-clustering-lda-first-experiments.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..1a77a710a25d0694712acef6e89be19f35ddc8bf --- /dev/null +++ b/20230323-unsupervised-clustering-lda-first-experiments.ipynb @@ -0,0 +1,1291 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Unsupervised clustering\n", + "\n", + "## Dataset loading" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 150, + "outputs": [ + { + "data": { + "text/plain": " title_texte 85 44 50 \\\n1 ipa supply equipment increase competitiveness... False True False \n3 provision language training service tender in... False False False \n4 service support eda helicopter portfolio main... False False False \n5 NUMBER cp op NUMBER pooling share cost non co... False False False \n6 edf supply transport household similar waste ... False False False \n\n 80 73 45 71 79 90 ... 18 03 24 43 \\\n1 False False False False False False ... False False False False \n3 True False False False False False ... False False False False \n4 True False False False False False ... False False False False \n5 False True False False False False ... False False False False \n6 False False True False False False ... 
False False False False \n\n 19 41 37 14 16 76 \n1 False False False False False False \n3 False False False False False False \n4 False False False False False False \n5 False False False False False False \n6 False False False False False False \n\n[5 rows x 46 columns]", + "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>title_texte</th>\n <th>85</th>\n <th>44</th>\n <th>50</th>\n <th>80</th>\n <th>73</th>\n <th>45</th>\n <th>71</th>\n <th>79</th>\n <th>90</th>\n <th>...</th>\n <th>18</th>\n <th>03</th>\n <th>24</th>\n <th>43</th>\n <th>19</th>\n <th>41</th>\n <th>37</th>\n <th>14</th>\n <th>16</th>\n <th>76</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>ipa supply equipment increase competitiveness...</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n <tr>\n <th>3</th>\n <td>provision language training service tender in...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n <tr>\n <th>4</th>\n <td>service support eda helicopter portfolio main...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n 
<td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n <tr>\n <th>5</th>\n <td>NUMBER cp op NUMBER pooling share cost non co...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n <tr>\n <th>6</th>\n <td>edf supply transport household similar waste ...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 46 columns</p>\n</div>" + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"20230214-dataset_preprocessed_with_lemma.csv\", index_col=0)\n", + "df.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 151, + "outputs": [ + { + "data": { + "text/plain": "((11647, 46), (2912, 46))" + }, + "execution_count": 151, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "cpvs = [c for c in df.columns if len(c) == 2]\n", + "df_train, df_test = train_test_split(df, test_size=0.2, shuffle=False)\n", + "(df_train.shape, df_test.shape)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 152, + "outputs": [ + 
{ + "data": { + "text/plain": "{'85': 256,\n '44': 103,\n '50': 297,\n '80': 403,\n '73': 1067,\n '45': 731,\n '71': 1621,\n '79': 2682,\n '90': 629,\n '30': 266,\n '35': 145,\n '33': 158,\n '55': 117,\n '72': 914,\n '48': 199,\n '38': 289,\n '09': 128,\n '75': 277,\n '66': 206,\n '64': 148,\n '42': 159,\n '34': 199,\n '60': 122,\n '92': 169,\n '39': 188,\n '31': 139,\n '98': 123,\n '51': 50,\n '32': 185,\n '65': 29,\n '77': 83,\n '22': 61,\n '63': 144,\n '15': 43,\n '70': 44,\n '18': 35,\n '03': 31,\n '24': 30,\n '43': 17,\n '19': 7,\n '41': 13,\n '37': 13,\n '14': 16,\n '16': 5,\n '76': 5}" + }, + "execution_count": 152, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{c: df_train[c].sum() for c in cpvs}" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 153, + "outputs": [ + { + "data": { + "text/plain": "{'85': 62,\n '44': 23,\n '50': 68,\n '80': 99,\n '73': 249,\n '45': 191,\n '71': 404,\n '79': 688,\n '90': 176,\n '30': 70,\n '35': 36,\n '33': 38,\n '55': 30,\n '72': 197,\n '48': 43,\n '38': 61,\n '09': 34,\n '75': 77,\n '66': 46,\n '64': 37,\n '42': 27,\n '34': 50,\n '60': 41,\n '92': 43,\n '39': 49,\n '31': 36,\n '98': 35,\n '51': 8,\n '32': 40,\n '65': 15,\n '77': 22,\n '22': 20,\n '63': 34,\n '15': 5,\n '70': 14,\n '18': 5,\n '03': 5,\n '24': 11,\n '43': 1,\n '19': 5,\n '41': 2,\n '37': 3,\n '14': 6,\n '16': 4,\n '76': 3}" + }, + "execution_count": 153, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{c: df_test[c].sum() for c in cpvs}" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Topic creation using LDA" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 154, + "outputs": [ + { + "data": { + "text/plain": "1 [, ipa, supply, equipment, increase, competiti...\n3 [, provision, language, training, service, ten...\n4 [, service, support, eda, 
helicopter, portfoli...\n5 [, NUMBER, cp, op, NUMBER, pooling, share, cos...\n6 [, edf, supply, transport, household, similar,...\nName: title_texte, dtype: object" + }, + "execution_count": 154, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "processed_docs = df_train[\"title_texte\"].apply(lambda x: x.split(\" \"))\n", + "processed_docs.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 155, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, 'NUMBER'), (1, 'ask'), (2, 'authority'), (3, 'black'), (4, 'cheap'), (5, 'competitiveness'), (6, 'compliant'), (7, 'contract'), (8, 'contracting'), (9, 'countersign')]\n" + ] + } + ], + "source": [ + "import gensim\n", + "\n", + "dictionary = gensim.corpora.Dictionary(processed_docs)\n", + "dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)\n", + "print(list(dictionary.iteritems())[:10])" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Using bag-of-words" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 156, + "outputs": [ + { + "data": { + "text/plain": "[[(0, 2),\n (1, 1),\n (2, 1),\n (3, 1),\n (4, 1),\n (5, 1),\n (6, 1),\n (7, 6),\n (8, 1),\n (9, 1),\n (10, 1),\n (11, 1),\n (12, 1),\n (13, 1),\n (14, 3),\n (15, 1),\n (16, 1),\n (17, 1),\n (18, 1),\n (19, 1),\n (20, 1),\n (21, 1),\n (22, 1),\n (23, 3),\n (24, 1),\n (25, 1),\n (26, 1),\n (27, 1),\n (28, 1),\n (29, 1),\n (30, 1),\n (31, 1),\n (32, 1),\n (33, 1),\n (34, 1),\n (35, 1),\n (36, 4),\n (37, 1)],\n [(7, 1),\n (35, 1),\n (38, 1),\n (39, 1),\n (40, 1),\n (41, 1),\n (42, 1),\n (43, 1),\n (44, 1),\n (45, 1),\n (46, 1),\n (47, 1),\n (48, 1)],\n [(7, 1),\n (23, 1),\n (45, 1),\n (47, 3),\n (48, 1),\n (49, 1),\n (50, 1),\n (51, 2),\n (52, 1),\n (53, 1),\n (54, 1),\n (55, 1),\n (56, 1),\n (57, 1),\n (58, 1),\n (59, 2),\n (60, 1),\n 
(61, 2),\n (62, 1)],\n [(0, 3),\n (18, 1),\n (21, 1),\n (50, 1),\n (62, 1),\n (63, 2),\n (64, 1),\n (65, 1),\n (66, 1),\n (67, 1),\n (68, 2),\n (69, 1),\n (70, 1),\n (71, 1),\n (72, 1),\n (73, 1),\n (74, 1),\n (75, 1),\n (76, 1),\n (77, 1),\n (78, 1),\n (79, 1),\n (80, 1),\n (81, 1),\n (82, 1),\n (83, 1),\n (84, 1),\n (85, 1),\n (86, 1),\n (87, 1),\n (88, 1),\n (89, 1),\n (90, 2),\n (91, 1),\n (92, 1),\n (93, 1),\n (94, 1),\n (95, 2),\n (96, 1),\n (97, 1),\n (98, 1),\n (99, 1)],\n [(0, 2),\n (12, 1),\n (34, 1),\n (100, 1),\n (101, 1),\n (102, 1),\n (103, 1),\n (104, 1),\n (105, 1),\n (106, 1),\n (107, 1)],\n [(0, 2),\n (16, 1),\n (30, 1),\n (32, 1),\n (107, 1),\n (108, 2),\n (109, 2),\n (110, 1),\n (111, 1),\n (112, 1),\n (113, 1),\n (114, 1),\n (115, 1),\n (116, 1),\n (117, 2),\n (118, 1),\n (119, 1),\n (120, 1),\n (121, 1),\n (122, 1),\n (123, 1),\n (124, 1),\n (125, 1),\n (126, 1),\n (127, 1)],\n [(0, 1),\n (7, 1),\n (35, 1),\n (47, 2),\n (57, 1),\n (75, 1),\n (119, 1),\n (128, 1),\n (129, 1),\n (130, 1),\n (131, 1),\n (132, 1),\n (133, 1),\n (134, 1),\n (135, 1),\n (136, 1),\n (137, 1),\n (138, 1),\n (139, 2),\n (140, 1),\n (141, 1),\n (142, 1)],\n [(22, 1),\n (28, 1),\n (35, 5),\n (36, 1),\n (47, 1),\n (48, 4),\n (109, 1),\n (112, 1),\n (143, 2),\n (144, 1),\n (145, 1),\n (146, 1),\n (147, 1),\n (148, 1),\n (149, 2),\n (150, 1),\n (151, 2),\n (152, 1),\n (153, 2),\n (154, 1),\n (155, 2),\n (156, 1),\n (157, 3),\n (158, 1),\n (159, 1),\n (160, 1),\n (161, 1),\n (162, 1),\n (163, 1),\n (164, 1),\n (165, 1),\n (166, 1),\n (167, 1),\n (168, 1),\n (169, 1),\n (170, 1),\n (171, 1),\n (172, 1)],\n [(0, 3),\n (2, 1),\n (8, 1),\n (10, 1),\n (17, 1),\n (34, 1),\n (35, 1),\n (95, 1),\n (102, 1),\n (120, 1),\n (124, 1),\n (147, 1),\n (150, 1),\n (169, 1),\n (173, 1),\n (174, 1),\n (175, 1),\n (176, 1),\n (177, 1),\n (178, 1),\n (179, 1),\n (180, 1),\n (181, 1),\n (182, 2)],\n [(0, 2),\n (7, 3),\n (12, 1),\n (23, 1),\n (35, 1),\n (47, 5),\n (57, 1),\n (75, 2),\n (99, 1),\n 
(117, 1),\n (119, 1),\n (128, 1),\n (129, 1),\n (130, 2),\n (132, 2),\n (135, 1),\n (138, 1),\n (141, 1),\n (142, 2),\n (183, 1),\n (184, 3),\n (185, 1),\n (186, 1),\n (187, 1),\n (188, 1),\n (189, 1),\n (190, 1),\n (191, 1),\n (192, 2),\n (193, 1),\n (194, 1),\n (195, 1)]]" + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]\n", + "bow_corpus[:10]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 157, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- Topic: 0 --\n", + "0.091*\"service\" + 0.064*\"event\" + 0.032*\"medical\" + 0.031*\"travel\" + 0.028*\"provision\" + 0.026*\"organisation\" + 0.023*\"agency\" + 0.019*\"conference\" + 0.017*\"provide\" + 0.015*\"online\"\n", + "\n", + "-- Topic: 1 --\n", + "0.074*\"medium\" + 0.052*\"service\" + 0.046*\"european\" + 0.027*\"social\" + 0.026*\"parliament\" + 0.023*\"monitoring\" + 0.022*\"committee\" + 0.022*\"communication\" + 0.019*\"contract\" + 0.019*\"NUMBER\"\n", + "\n", + "-- Topic: 2 --\n", + "0.128*\"NUMBER\" + 0.056*\"directive\" + 0.051*\"regulation\" + 0.035*\"council\" + 0.033*\"eu\" + 0.030*\"ec\" + 0.029*\"european\" + 0.025*\"commission\" + 0.019*\"article\" + 0.018*\"parliament\"\n", + "\n", + "-- Topic: 3 --\n", + "0.031*\"system\" + 0.028*\"contract\" + 0.021*\"support\" + 0.019*\"iter\" + 0.015*\"study\" + 0.014*\"service\" + 0.013*\"provide\" + 0.013*\"development\" + 0.013*\"capability\" + 0.012*\"NUMBER\"\n", + "\n", + "-- Topic: 4 --\n", + "0.053*\"service\" + 0.046*\"production\" + 0.045*\"document\" + 0.040*\"product\" + 0.030*\"language\" + 0.029*\"design\" + 0.024*\"material\" + 0.022*\"tender\" + 0.021*\"relate\" + 0.020*\"contract\"\n", + "\n", + "-- Topic: 5 --\n", + "0.077*\"maintenance\" + 0.071*\"installation\" + 0.055*\"supply\" + 0.044*\"system\" + 0.023*\"NUMBER\" + 0.023*\"jrc\" 
+ 0.016*\"equipment\" + 0.015*\"plant\" + 0.014*\"contract\" + 0.014*\"include\"\n", + "\n", + "-- Topic: 6 --\n", + "0.301*\"NUMBER\" + 0.250*\"lot\" + 0.040*\"divide\" + 0.034*\"supply\" + 0.018*\"contract\" + 0.017*\"tender\" + 0.012*\"electricity\" + 0.012*\"ecb\" + 0.011*\"main\" + 0.011*\"follow\"\n", + "\n", + "-- Topic: 7 --\n", + "0.118*\"training\" + 0.036*\"NUMBER\" + 0.028*\"programme\" + 0.021*\"course\" + 0.020*\"activity\" + 0.019*\"protection\" + 0.019*\"organisation\" + 0.016*\"civil\" + 0.013*\"contract\" + 0.013*\"initiative\"\n", + "\n", + "-- Topic: 8 --\n", + "0.081*\"NUMBER\" + 0.057*\"authority\" + 0.051*\"contracting\" + 0.046*\"address\" + 0.042*\"day\" + 0.040*\"point\" + 0.039*\"email\" + 0.038*\"seek\" + 0.037*\"submission\" + 0.037*\"deadline\"\n", + "\n", + "-- Topic: 9 --\n", + "0.085*\"construction\" + 0.064*\"water\" + 0.056*\"work\" + 0.037*\"eib\" + 0.025*\"supervision\" + 0.025*\"project\" + 0.020*\"design\" + 0.018*\"treatment\" + 0.017*\"plant\" + 0.016*\"system\"\n", + "\n", + "-- Topic: 10 --\n", + "0.081*\"project\" + 0.065*\"technical\" + 0.061*\"assistance\" + 0.047*\"subject\" + 0.035*\"authority\" + 0.033*\"contract\" + 0.030*\"contracting\" + 0.029*\"extend\" + 0.028*\"scope\" + 0.028*\"availability\"\n", + "\n", + "-- Topic: 11 --\n", + "0.208*\"service\" + 0.085*\"provision\" + 0.022*\"procedure\" + 0.021*\"consultancy\" + 0.021*\"procurement\" + 0.020*\"relate\" + 0.019*\"provider\" + 0.017*\"ict\" + 0.016*\"management\" + 0.015*\"related\"\n", + "\n", + "-- Topic: 12 --\n", + "0.099*\"service\" + 0.049*\"european\" + 0.041*\"contract\" + 0.037*\"tender\" + 0.036*\"company\" + 0.035*\"framework\" + 0.034*\"eea\" + 0.033*\"security\" + 0.033*\"union\" + 0.033*\"NUMBER\"\n", + "\n", + "-- Topic: 13 --\n", + "0.077*\"translation\" + 0.035*\"european\" + 0.035*\"union\" + 0.035*\"french\" + 0.032*\"body\" + 0.029*\"centre\" + 0.029*\"english\" + 0.025*\"german\" + 0.023*\"property\" + 0.023*\"framework\"\n", + "\n", + 
"-- Topic: 14 --\n", + "0.054*\"emergency\" + 0.049*\"ipa\" + 0.026*\"woman\" + 0.023*\"georgia\" + 0.021*\"turkey\" + 0.020*\"country\" + 0.020*\"south\" + 0.018*\"response\" + 0.018*\"assistance\" + 0.018*\"negotiation\"\n", + "\n", + "-- Topic: 15 --\n", + "0.042*\"skill\" + 0.041*\"industrial\" + 0.032*\"labour\" + 0.029*\"improve\" + 0.025*\"rail\" + 0.021*\"market\" + 0.018*\"housing\" + 0.018*\"reporting\" + 0.016*\"purchase\" + 0.015*\"register\"\n", + "\n", + "-- Topic: 16 --\n", + "0.169*\"equipment\" + 0.089*\"supply\" + 0.064*\"insurance\" + 0.031*\"purchase\" + 0.027*\"computer\" + 0.027*\"installation\" + 0.023*\"service\" + 0.022*\"furniture\" + 0.021*\"hospital\" + 0.020*\"accident\"\n", + "\n", + "-- Topic: 17 --\n", + "0.037*\"project\" + 0.030*\"sme\" + 0.029*\"helpdesk\" + 0.025*\"payment\" + 0.022*\"financing\" + 0.022*\"pilot\" + 0.022*\"enterprise\" + 0.021*\"development\" + 0.016*\"high\" + 0.015*\"service\"\n", + "\n", + "-- Topic: 18 --\n", + "0.070*\"study\" + 0.051*\"eu\" + 0.043*\"analysis\" + 0.023*\"european\" + 0.021*\"provide\" + 0.020*\"market\" + 0.019*\"information\" + 0.015*\"product\" + 0.014*\"feasibility\" + 0.013*\"NUMBER\"\n", + "\n", + "-- Topic: 19 --\n", + "0.059*\"risk\" + 0.043*\"food\" + 0.043*\"assessment\" + 0.035*\"safety\" + 0.027*\"efsa\" + 0.024*\"health\" + 0.022*\"laboratory\" + 0.022*\"scientific\" + 0.020*\"chemical\" + 0.020*\"procurement\"\n", + "\n", + "-- Topic: 20 --\n", + "0.088*\"development\" + 0.069*\"system\" + 0.059*\"support\" + 0.054*\"maintenance\" + 0.046*\"software\" + 0.036*\"information\" + 0.031*\"service\" + 0.030*\"management\" + 0.023*\"solution\" + 0.022*\"application\"\n", + "\n", + "-- Topic: 21 --\n", + "0.116*\"service\" + 0.115*\"framework\" + 0.107*\"contract\" + 0.081*\"provision\" + 0.036*\"staff\" + 0.027*\"office\" + 0.023*\"multiple\" + 0.019*\"conclude\" + 0.018*\"tender\" + 0.016*\"agency\"\n", + "\n", + "-- Topic: 22 --\n", + "0.096*\"contract\" + 0.063*\"framework\" + 
0.055*\"NUMBER\" + 0.053*\"year\" + 0.026*\"lot\" + 0.025*\"order\" + 0.024*\"award\" + 0.021*\"maximum\" + 0.021*\"contractor\" + 0.021*\"conclusion\"\n", + "\n", + "-- Topic: 23 --\n", + "0.032*\"NUMBER\" + 0.029*\"member\" + 0.028*\"eu\" + 0.026*\"state\" + 0.022*\"study\" + 0.015*\"national\" + 0.013*\"cost\" + 0.013*\"use\" + 0.013*\"provide\" + 0.012*\"objective\"\n", + "\n", + "-- Topic: 24 --\n", + "0.045*\"health\" + 0.027*\"mobile\" + 0.024*\"occupational\" + 0.023*\"radio\" + 0.021*\"statistical\" + 0.019*\"telecommunication\" + 0.018*\"transmission\" + 0.015*\"supply\" + 0.015*\"multi\" + 0.015*\"measurement\"\n", + "\n", + "-- Topic: 25 --\n", + "0.136*\"security\" + 0.072*\"service\" + 0.045*\"system\" + 0.044*\"provide\" + 0.040*\"delegation\" + 0.038*\"european\" + 0.036*\"union\" + 0.033*\"resource\" + 0.028*\"human\" + 0.025*\"provision\"\n", + "\n", + "-- Topic: 26 --\n", + "0.056*\"support\" + 0.039*\"policy\" + 0.035*\"eu\" + 0.024*\"environmental\" + 0.022*\"implementation\" + 0.021*\"commission\" + 0.020*\"contract\" + 0.019*\"provide\" + 0.018*\"development\" + 0.015*\"indicator\"\n", + "\n", + "-- Topic: 27 --\n", + "0.078*\"vehicle\" + 0.046*\"court\" + 0.044*\"european\" + 0.041*\"test\" + 0.032*\"justice\" + 0.032*\"emission\" + 0.031*\"union\" + 0.023*\"car\" + 0.023*\"supply\" + 0.023*\"fuel\"\n", + "\n", + "-- Topic: 28 --\n", + "0.166*\"energy\" + 0.050*\"house\" + 0.044*\"service\" + 0.043*\"eu\" + 0.037*\"reception\" + 0.031*\"efficiency\" + 0.030*\"tender\" + 0.028*\"renewable\" + 0.027*\"premise\" + 0.020*\"security\"\n", + "\n", + "-- Topic: 29 --\n", + "0.128*\"contract\" + 0.087*\"tenderer\" + 0.066*\"guarantee\" + 0.064*\"provide\" + 0.044*\"NUMBER\" + 0.023*\"return\" + 0.022*\"period\" + 0.022*\"later\" + 0.022*\"performance\" + 0.022*\"receive\"\n", + "\n", + "-- Topic: 30 --\n", + "0.045*\"NUMBER\" + 0.041*\"supply\" + 0.033*\"room\" + 0.032*\"gas\" + 0.023*\"galileo\" + 0.021*\"fuel\" + 0.020*\"use\" + 
0.018*\"satellite\" + 0.016*\"natural\" + 0.014*\"heating\"\n", + "\n", + "-- Topic: 31 --\n", + "0.125*\"datum\" + 0.023*\"collection\" + 0.023*\"review\" + 0.022*\"eu\" + 0.018*\"data\" + 0.017*\"country\" + 0.014*\"approach\" + 0.013*\"research\" + 0.012*\"study\" + 0.012*\"objective\"\n", + "\n", + "-- Topic: 32 --\n", + "0.042*\"support\" + 0.029*\"task\" + 0.025*\"provide\" + 0.025*\"coordination\" + 0.021*\"specific\" + 0.019*\"contract\" + 0.017*\"technical\" + 0.017*\"service\" + 0.016*\"ecdc\" + 0.016*\"video\"\n", + "\n", + "-- Topic: 33 --\n", + "0.055*\"evaluation\" + 0.045*\"impact\" + 0.044*\"assessment\" + 0.037*\"study\" + 0.030*\"NUMBER\" + 0.028*\"economic\" + 0.024*\"policy\" + 0.023*\"contract\" + 0.022*\"analysis\" + 0.015*\"gender\"\n", + "\n", + "-- Topic: 34 --\n", + "0.108*\"contract\" + 0.075*\"tenderer\" + 0.052*\"bond\" + 0.045*\"successful\" + 0.045*\"NUMBER\" + 0.040*\"sign\" + 0.036*\"provide\" + 0.036*\"require\" + 0.022*\"price\" + 0.021*\"time\"\n", + "\n", + "-- Topic: 35 --\n", + "0.071*\"service\" + 0.058*\"eib\" + 0.049*\"management\" + 0.040*\"ecb\" + 0.032*\"consultancy\" + 0.028*\"provision\" + 0.026*\"support\" + 0.024*\"bank\" + 0.021*\"group\" + 0.019*\"framework\"\n", + "\n", + "-- Topic: 36 --\n", + "0.133*\"NUMBER\" + 0.030*\"audit\" + 0.027*\"general\" + 0.024*\"financial\" + 0.023*\"eu\" + 0.022*\"service\" + 0.021*\"contract\" + 0.020*\"gsa\" + 0.020*\"MONTH\" + 0.018*\"budget\"\n", + "\n", + "-- Topic: 37 --\n", + "0.055*\"infrastructure\" + 0.055*\"operation\" + 0.052*\"access\" + 0.030*\"database\" + 0.026*\"support\" + 0.024*\"investment\" + 0.020*\"implementation\" + 0.019*\"finance\" + 0.016*\"subscription\" + 0.016*\"platform\"\n", + "\n", + "-- Topic: 38 --\n", + "0.135*\"NUMBER\" + 0.077*\"road\" + 0.069*\"work\" + 0.041*\"construction\" + 0.037*\"km\" + 0.028*\"railway\" + 0.024*\"rehabilitation\" + 0.022*\"line\" + 0.018*\"section\" + 0.017*\"safety\"\n", + "\n", + "-- Topic: 39 --\n", + 
"0.096*\"european\" + 0.078*\"building\" + 0.057*\"parliament\" + 0.039*\"brussels\" + 0.038*\"service\" + 0.030*\"contract\" + 0.027*\"luxembourg\" + 0.022*\"NUMBER\" + 0.021*\"strasbourg\" + 0.019*\"work\"\n", + "\n", + "-- Topic: 40 --\n", + "0.056*\"administrative\" + 0.055*\"legal\" + 0.053*\"law\" + 0.050*\"advice\" + 0.039*\"sea\" + 0.027*\"service\" + 0.026*\"different\" + 0.021*\"cultural\" + 0.020*\"assistance\" + 0.018*\"learning\"\n", + "\n", + "-- Topic: 41 --\n", + "0.035*\"climate\" + 0.033*\"copernicus\" + 0.032*\"service\" + 0.030*\"NUMBER\" + 0.028*\"support\" + 0.027*\"change\" + 0.022*\"global\" + 0.022*\"activity\" + 0.019*\"land\" + 0.015*\"monitoring\"\n", + "\n", + "-- Topic: 42 --\n", + "0.074*\"service\" + 0.069*\"european\" + 0.054*\"communication\" + 0.049*\"commission\" + 0.029*\"information\" + 0.023*\"public\" + 0.022*\"representation\" + 0.021*\"provision\" + 0.019*\"activity\" + 0.016*\"provide\"\n", + "\n", + "-- Topic: 43 --\n", + "0.081*\"european\" + 0.077*\"network\" + 0.067*\"education\" + 0.038*\"school\" + 0.035*\"support\" + 0.023*\"commission\" + 0.018*\"high\" + 0.018*\"programme\" + 0.014*\"research\" + 0.013*\"procure\"\n", + "\n", + "-- Topic: 44 --\n", + "0.090*\"application\" + 0.087*\"form\" + 0.045*\"section\" + 0.045*\"standard\" + 0.045*\"NUMBER\" + 0.044*\"candidate\" + 0.043*\"situation\" + 0.043*\"submit\" + 0.042*\"sign\" + 0.042*\"include\"\n", + "\n", + "-- Topic: 45 --\n", + "0.105*\"transport\" + 0.066*\"survey\" + 0.038*\"european\" + 0.038*\"international\" + 0.030*\"statistic\" + 0.029*\"aviation\" + 0.022*\"safety\" + 0.021*\"level\" + 0.018*\"service\" + 0.018*\"agency\"\n", + "\n", + "-- Topic: 46 --\n", + "0.074*\"site\" + 0.069*\"jrc\" + 0.044*\"centre\" + 0.042*\"ispra\" + 0.036*\"joint\" + 0.034*\"research\" + 0.028*\"building\" + 0.027*\"service\" + 0.025*\"work\" + 0.024*\"NUMBER\"\n", + "\n", + "-- Topic: 47 --\n", + "0.046*\"NUMBER\" + 0.035*\"service\" + 0.030*\"device\" + 
0.029*\"equipment\" + 0.022*\"time\" + 0.022*\"frontex\" + 0.020*\"acquisition\" + 0.019*\"provision\" + 0.017*\"supply\" + 0.016*\"purchase\"\n", + "\n", + "-- Topic: 48 --\n", + "0.030*\"study\" + 0.022*\"digital\" + 0.021*\"practice\" + 0.021*\"sector\" + 0.020*\"market\" + 0.020*\"eu\" + 0.019*\"good\" + 0.019*\"innovation\" + 0.018*\"objective\" + 0.017*\"smart\"\n", + "\n", + "-- Topic: 49 --\n", + "0.075*\"waste\" + 0.050*\"epo\" + 0.046*\"right\" + 0.043*\"nuclear\" + 0.042*\"allow\" + 0.041*\"de\" + 0.035*\"reduce\" + 0.035*\"safeguard\" + 0.034*\"proposal\" + 0.030*\"effect\"\n", + "\n" + ] + } + ], + "source": [ + "# 20 passes seems to give better results than 2\n", + "lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=50, id2word=dictionary, passes=20, workers=4,\n", + " minimum_probability=0.0)\n", + "for idx, topic in lda_model.print_topics(-1):\n", + " print('-- Topic: {} --\\n{}\\n'.format(idx, topic))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Using TFIDF" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 158, + "outputs": [ + { + "data": { + "text/plain": "[(0, 0.05665082498263276),\n (1, 0.12807236880516995),\n (2, 0.07933089660064745),\n (3, 0.25230586430222485),\n (4, 0.1299596107526431),\n (5, 0.20710308682000417),\n (6, 0.11895117349717746),\n (7, 0.22539927799416767),\n (8, 0.08384683915242945),\n (9, 0.12102247861336683),\n (10, 0.09923408933597923),\n (11, 0.11454595197543345),\n (12, 0.10104473789728612),\n (13, 0.1293204098325476),\n (14, 0.36893131914584504),\n (15, 0.16137963960080237),\n (16, 0.10373794343782906),\n (17, 0.12669292935869714),\n (18, 0.2590612962599304),\n (19, 0.09142624414586832),\n (20, 0.09991406288160162),\n (21, 0.11075409983330753),\n (22, 0.08652843993418363),\n (23, 0.19053225489254916),\n (24, 0.11678553178749576),\n (25, 0.14961287443002041),\n (26, 0.1268627351970124),\n (27, 
0.19781064621306588),\n (28, 0.11075409983330753),\n (29, 0.11916045521835175),\n (30, 0.08038608899477187),\n (31, 0.1293204098325476),\n (32, 0.08192889826513393),\n (33, 0.11338158030715803),\n (34, 0.07677484702566353),\n (35, 0.06298784767348975),\n (36, 0.44598112748464985),\n (37, 0.12008044297830643)]" + }, + "execution_count": 158, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from gensim.models import TfidfModel\n", + "\n", + "tfidf = TfidfModel(bow_corpus)\n", + "tfidf_corpus = tfidf[bow_corpus]\n", + "tfidf_corpus[0]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 159, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- Topic: 0 --\n", + "0.076*\"bond\" + 0.063*\"tenderer\" + 0.034*\"successful\" + 0.032*\"contract\" + 0.031*\"require\" + 0.028*\"price\" + 0.025*\"low\" + 0.024*\"render\" + 0.024*\"sign\" + 0.024*\"date\"\n", + "\n", + "-- Topic: 1 --\n", + "0.044*\"security\" + 0.037*\"fwc\" + 0.037*\"delegation\" + 0.032*\"ser\" + 0.031*\"specialise\" + 0.029*\"company\" + 0.025*\"ref\" + 0.024*\"union\" + 0.023*\"eea\" + 0.022*\"person\"\n", + "\n", + "-- Topic: 2 --\n", + "0.039*\"copernicus\" + 0.017*\"land\" + 0.015*\"climate\" + 0.015*\"aid\" + 0.014*\"ecmwf\" + 0.012*\"cap\" + 0.011*\"global\" + 0.011*\"natura\" + 0.009*\"change\" + 0.009*\"service\"\n", + "\n", + "-- Topic: 3 --\n", + "0.033*\"electricity\" + 0.025*\"fuel\" + 0.019*\"corporate\" + 0.017*\"renewable\" + 0.017*\"radio\" + 0.016*\"ceb\" + 0.015*\"enhancement\" + 0.014*\"spatial\" + 0.014*\"energy\" + 0.013*\"planning\"\n", + "\n", + "-- Topic: 4 --\n", + "0.064*\"translation\" + 0.032*\"language\" + 0.026*\"french\" + 0.024*\"english\" + 0.022*\"german\" + 0.020*\"centre\" + 0.019*\"text\" + 0.018*\"body\" + 0.018*\"italian\" + 0.018*\"spanish\"\n", + "\n", + "-- Topic: 5 --\n", + "0.022*\"gas\" + 0.017*\"natural\" + 0.016*\"lift\" + 0.016*\"occupy\" + 
0.014*\"european\" + 0.014*\"brussels\" + 0.013*\"competition\" + 0.013*\"building\" + 0.012*\"capital\" + 0.012*\"associated\"\n", + "\n", + "-- Topic: 6 --\n", + "0.062*\"training\" + 0.026*\"food\" + 0.024*\"safe\" + 0.019*\"gender\" + 0.017*\"initiative\" + 0.016*\"equality\" + 0.015*\"organisation\" + 0.015*\"activity\" + 0.014*\"participant\" + 0.014*\"certification\"\n", + "\n", + "-- Topic: 7 --\n", + "0.026*\"property\" + 0.026*\"frontex\" + 0.021*\"office\" + 0.020*\"ecdc\" + 0.020*\"floor\" + 0.020*\"accessory\" + 0.019*\"malta\" + 0.015*\"easo\" + 0.014*\"asylum\" + 0.014*\"end\"\n", + "\n", + "-- Topic: 8 --\n", + "0.076*\"form\" + 0.067*\"application\" + 0.040*\"exclusion\" + 0.040*\"declaration\" + 0.040*\"situation\" + 0.040*\"candidate\" + 0.039*\"section\" + 0.038*\"standard\" + 0.037*\"effect\" + 0.035*\"list\"\n", + "\n", + "-- Topic: 9 --\n", + "0.050*\"ict\" + 0.038*\"helpdesk\" + 0.027*\"service\" + 0.027*\"maintenance\" + 0.021*\"upgrade\" + 0.017*\"provision\" + 0.015*\"operational\" + 0.015*\"interpol\" + 0.015*\"specification\" + 0.014*\"data\"\n", + "\n", + "-- Topic: 10 --\n", + "0.062*\"epo\" + 0.040*\"safeguard\" + 0.038*\"variant\" + 0.037*\"allow\" + 0.037*\"significantly\" + 0.036*\"proposal\" + 0.036*\"right\" + 0.035*\"reduce\" + 0.024*\"munich\" + 0.020*\"effect\"\n", + "\n", + "-- Topic: 11 --\n", + "0.032*\"building\" + 0.024*\"jrc\" + 0.022*\"parliament\" + 0.021*\"brussels\" + 0.021*\"installation\" + 0.020*\"strasbourg\" + 0.019*\"site\" + 0.017*\"maintenance\" + 0.017*\"luxembourg\" + 0.016*\"karlsruhe\"\n", + "\n", + "-- Topic: 12 --\n", + "0.022*\"radiation\" + 0.019*\"protection\" + 0.016*\"surface\" + 0.015*\"fishery\" + 0.014*\"civil\" + 0.013*\"secondary\" + 0.013*\"university\" + 0.012*\"train\" + 0.011*\"international\" + 0.011*\"echo\"\n", + "\n", + "-- Topic: 13 --\n", + "0.052*\"software\" + 0.021*\"maintenance\" + 0.021*\"portal\" + 0.020*\"system\" + 0.019*\"development\" + 0.018*\"acquisition\" + 
0.018*\"solution\" + 0.017*\"architecture\" + 0.017*\"housing\" + 0.016*\"hardware\"\n", + "\n", + "-- Topic: 14 --\n", + "0.061*\"court\" + 0.051*\"justice\" + 0.025*\"enisa\" + 0.021*\"real\" + 0.019*\"union\" + 0.017*\"auditor\" + 0.016*\"minimum\" + 0.015*\"strengthening\" + 0.015*\"metro\" + 0.015*\"dam\"\n", + "\n", + "-- Topic: 15 --\n", + "0.028*\"administration\" + 0.027*\"reform\" + 0.027*\"assistance\" + 0.027*\"technical\" + 0.021*\"financing\" + 0.021*\"government\" + 0.021*\"agreement\" + 0.021*\"finance\" + 0.020*\"society\" + 0.020*\"edf\"\n", + "\n", + "-- Topic: 16 --\n", + "0.032*\"print\" + 0.030*\"communication\" + 0.029*\"electronic\" + 0.025*\"logistic\" + 0.023*\"paper\" + 0.022*\"campaign\" + 0.021*\"book\" + 0.021*\"publication\" + 0.021*\"ohim\" + 0.018*\"creative\"\n", + "\n", + "-- Topic: 17 --\n", + "0.058*\"furniture\" + 0.052*\"cleaning\" + 0.040*\"clean\" + 0.034*\"office\" + 0.030*\"euipo\" + 0.028*\"waste\" + 0.023*\"premise\" + 0.022*\"ad\" + 0.020*\"hoc\" + 0.019*\"supply\"\n", + "\n", + "-- Topic: 18 --\n", + "0.046*\"audit\" + 0.025*\"auditing\" + 0.022*\"replacement\" + 0.022*\"board\" + 0.022*\"satellite\" + 0.021*\"tourism\" + 0.020*\"cater\" + 0.020*\"internal\" + 0.018*\"heat\" + 0.018*\"lead\"\n", + "\n", + "-- Topic: 19 --\n", + "0.037*\"easo\" + 0.033*\"event\" + 0.030*\"cedefop\" + 0.028*\"room\" + 0.028*\"meeting\" + 0.026*\"conference\" + 0.024*\"greece\" + 0.018*\"provision\" + 0.018*\"cleaning\" + 0.018*\"italy\"\n", + "\n", + "-- Topic: 20 --\n", + "0.020*\"western\" + 0.019*\"youth\" + 0.018*\"vehicle\" + 0.017*\"voltage\" + 0.016*\"electric\" + 0.015*\"police\" + 0.014*\"balkan\" + 0.013*\"register\" + 0.013*\"authorise\" + 0.013*\"officer\"\n", + "\n", + "-- Topic: 21 --\n", + "0.038*\"gov\" + 0.026*\"clarification\" + 0.026*\"ipa\" + 0.026*\"reception\" + 0.026*\"tr\" + 0.026*\"email\" + 0.024*\"deadline\" + 0.024*\"submission\" + 0.023*\"etf\" + 0.023*\"address\"\n", + "\n", + "-- Topic: 22 --\n", + 
"0.044*\"ecb\" + 0.034*\"mobile\" + 0.025*\"ispra\" + 0.020*\"telecommunication\" + 0.019*\"supplier\" + 0.017*\"supply\" + 0.017*\"water\" + 0.016*\"joint\" + 0.015*\"distribution\" + 0.014*\"research\"\n", + "\n", + "-- Topic: 23 --\n", + "0.010*\"indicator\" + 0.010*\"esm\" + 0.010*\"datum\" + 0.009*\"account\" + 0.008*\"support\" + 0.008*\"report\" + 0.008*\"eu\" + 0.008*\"country\" + 0.007*\"partner\" + 0.007*\"service\"\n", + "\n", + "-- Topic: 24 --\n", + "0.095*\"tenderer\" + 0.076*\"guarantee\" + 0.047*\"contract\" + 0.040*\"provide\" + 0.027*\"signing\" + 0.027*\"ask\" + 0.027*\"fail\" + 0.027*\"cheap\" + 0.027*\"return\" + 0.026*\"later\"\n", + "\n", + "-- Topic: 25 --\n", + "0.014*\"study\" + 0.011*\"eu\" + 0.008*\"objective\" + 0.008*\"support\" + 0.008*\"policy\" + 0.007*\"evaluation\" + 0.007*\"impact\" + 0.007*\"assessment\" + 0.006*\"european\" + 0.006*\"development\"\n", + "\n", + "-- Topic: 26 --\n", + "0.059*\"construction\" + 0.040*\"work\" + 0.035*\"eib\" + 0.031*\"plant\" + 0.023*\"supervision\" + 0.022*\"road\" + 0.022*\"design\" + 0.022*\"rehabilitation\" + 0.018*\"km\" + 0.017*\"treatment\"\n", + "\n", + "-- Topic: 27 --\n", + "0.055*\"subject\" + 0.047*\"availability\" + 0.046*\"extend\" + 0.046*\"duration\" + 0.044*\"discretion\" + 0.042*\"funding\" + 0.036*\"scope\" + 0.032*\"satisfactory\" + 0.031*\"eur\" + 0.031*\"extension\"\n", + "\n", + "-- Topic: 28 --\n", + "0.040*\"directive\" + 0.023*\"regulation\" + 0.020*\"cable\" + 0.020*\"budget\" + 0.019*\"council\" + 0.019*\"ec\" + 0.017*\"gap\" + 0.012*\"rural\" + 0.012*\"eu\" + 0.012*\"air\"\n", + "\n", + "-- Topic: 29 --\n", + "0.076*\"security\" + 0.038*\"delegation\" + 0.038*\"system\" + 0.037*\"human\" + 0.035*\"resource\" + 0.035*\"guard\" + 0.029*\"necessary\" + 0.028*\"material\" + 0.026*\"alarm\" + 0.026*\"responsibility\"\n", + "\n", + "-- Topic: 30 --\n", + "0.076*\"eib\" + 0.024*\"project\" + 0.021*\"group\" + 0.020*\"management\" + 0.019*\"investment\" + 0.018*\"assistance\" 
+ 0.017*\"service\" + 0.016*\"consultancy\" + 0.015*\"advisory\" + 0.014*\"water\"\n", + "\n", + "-- Topic: 31 --\n", + "0.079*\"insurance\" + 0.032*\"product\" + 0.026*\"modernisation\" + 0.025*\"promotional\" + 0.024*\"carbon\" + 0.023*\"chain\" + 0.021*\"animal\" + 0.021*\"item\" + 0.021*\"scheme\" + 0.018*\"pension\"\n", + "\n", + "-- Topic: 32 --\n", + "0.051*\"medical\" + 0.022*\"cascade\" + 0.020*\"park\" + 0.020*\"multiple\" + 0.019*\"car\" + 0.019*\"eif\" + 0.018*\"hire\" + 0.017*\"manufacturing\" + 0.016*\"provision\" + 0.016*\"beam\"\n", + "\n", + "-- Topic: 33 --\n", + "0.031*\"review\" + 0.029*\"peer\" + 0.025*\"morocco\" + 0.022*\"portfolio\" + 0.021*\"registration\" + 0.021*\"bNUMBER\" + 0.021*\"eige\" + 0.017*\"literature\" + 0.017*\"sensor\" + 0.016*\"foresight\"\n", + "\n", + "-- Topic: 34 --\n", + "0.038*\"cfsp\" + 0.026*\"video\" + 0.025*\"audio\" + 0.023*\"kosovo\" + 0.019*\"visual\" + 0.017*\"georgia\" + 0.017*\"special\" + 0.016*\"multimedia\" + 0.015*\"co\" + 0.015*\"sanitary\"\n", + "\n", + "-- Topic: 35 --\n", + "0.035*\"measurement\" + 0.029*\"feed\" + 0.025*\"light\" + 0.024*\"emission\" + 0.024*\"hydrogen\" + 0.023*\"vehicle\" + 0.022*\"big\" + 0.022*\"portable\" + 0.020*\"workplace\" + 0.020*\"liquid\"\n", + "\n", + "-- Topic: 36 --\n", + "0.048*\"de\" + 0.046*\"statistic\" + 0.037*\"statistical\" + 0.036*\"la\" + 0.021*\"fit\" + 0.020*\"paris\" + 0.020*\"du\" + 0.019*\"et\" + 0.019*\"le\" + 0.018*\"fusion\"\n", + "\n", + "-- Topic: 37 --\n", + "0.061*\"travel\" + 0.035*\"agency\" + 0.029*\"service\" + 0.021*\"provision\" + 0.018*\"desk\" + 0.015*\"annex\" + 0.014*\"fix\" + 0.012*\"evolution\" + 0.012*\"accommodation\" + 0.011*\"arrangement\"\n", + "\n", + "-- Topic: 38 --\n", + "0.063*\"financial\" + 0.063*\"expression\" + 0.057*\"interest\" + 0.030*\"europe\" + 0.029*\"fund\" + 0.026*\"party\" + 0.024*\"preparatory\" + 0.021*\"council\" + 0.021*\"leadership\" + 0.021*\"accounting\"\n", + "\n", + "-- Topic: 39 --\n", + 
"0.030*\"engineering\" + 0.023*\"ukraine\" + 0.020*\"server\" + 0.016*\"architectural\" + 0.013*\"bus\" + 0.012*\"press\" + 0.012*\"port\" + 0.012*\"municipal\" + 0.010*\"passenger\" + 0.010*\"medicine\"\n", + "\n", + "-- Topic: 40 --\n", + "0.034*\"emsa\" + 0.032*\"aviation\" + 0.028*\"emcdda\" + 0.028*\"crisis\" + 0.026*\"tunnel\" + 0.025*\"exercise\" + 0.022*\"corridor\" + 0.021*\"station\" + 0.019*\"el\" + 0.019*\"preliminary\"\n", + "\n", + "-- Topic: 41 --\n", + "0.042*\"catering\" + 0.026*\"maintenance\" + 0.026*\"geel\" + 0.025*\"canteen\" + 0.024*\"repair\" + 0.021*\"preventive\" + 0.020*\"council\" + 0.018*\"building\" + 0.018*\"jrc\" + 0.018*\"irmm\"\n", + "\n", + "-- Topic: 42 --\n", + "0.038*\"survey\" + 0.032*\"occupational\" + 0.031*\"serbia\" + 0.029*\"health\" + 0.021*\"fundamental\" + 0.018*\"medical\" + 0.016*\"matter\" + 0.016*\"datum\" + 0.015*\"migration\" + 0.014*\"firm\"\n", + "\n", + "-- Topic: 43 --\n", + "0.034*\"representation\" + 0.032*\"house\" + 0.026*\"temporary\" + 0.024*\"european\" + 0.023*\"information\" + 0.021*\"visitor\" + 0.018*\"space\" + 0.018*\"exhibition\" + 0.018*\"office\" + 0.018*\"parliament\"\n", + "\n", + "-- Topic: 44 --\n", + "0.072*\"device\" + 0.028*\"rental\" + 0.026*\"www\" + 0.025*\"verification\" + 0.023*\"rs\" + 0.022*\"woman\" + 0.022*\"hybrid\" + 0.020*\"procurement\" + 0.020*\"portugal\" + 0.019*\"interactive\"\n", + "\n", + "-- Topic: 45 --\n", + "0.026*\"database\" + 0.026*\"computer\" + 0.023*\"scientific\" + 0.023*\"laboratory\" + 0.020*\"permit\" + 0.017*\"instrument\" + 0.017*\"custom\" + 0.017*\"subscription\" + 0.015*\"mail\" + 0.013*\"analytical\"\n", + "\n", + "-- Topic: 46 --\n", + "0.036*\"audiovisual\" + 0.034*\"medium\" + 0.030*\"hospital\" + 0.027*\"supply\" + 0.027*\"consumable\" + 0.023*\"sale\" + 0.023*\"monitoring\" + 0.020*\"equipment\" + 0.016*\"installation\" + 0.014*\"purchase\"\n", + "\n", + "-- Topic: 47 --\n", + "0.042*\"address\" + 0.041*\"email\" + 0.038*\"submission\" + 
0.038*\"deadline\" + 0.038*\"clarification\" + 0.037*\"following\" + 0.036*\"late\" + 0.033*\"europa\" + 0.032*\"point\" + 0.031*\"seek\"\n", + "\n", + "-- Topic: 48 --\n", + "0.025*\"interim\" + 0.019*\"framework\" + 0.019*\"gsa\" + 0.017*\"lot\" + 0.017*\"disease\" + 0.017*\"committee\" + 0.016*\"surveillance\" + 0.015*\"contract\" + 0.014*\"classification\" + 0.011*\"year\"\n", + "\n", + "-- Topic: 49 --\n", + "0.018*\"chemical\" + 0.017*\"defence\" + 0.016*\"service\" + 0.015*\"web\" + 0.014*\"technology\" + 0.013*\"provision\" + 0.013*\"framework\" + 0.011*\"environment\" + 0.011*\"eea\" + 0.010*\"digital\"\n", + "\n" + ] + } + ], + "source": [ + "lda_model_tfidf = gensim.models.LdaMulticore(tfidf_corpus, num_topics=50, id2word=dictionary, passes=20, workers=4,\n", + " minimum_probability=0.0)\n", + "for idx, topic in lda_model_tfidf.print_topics(-1):\n", + " print('-- Topic: {} --\\n{}\\n'.format(idx, topic))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Link between topics and CPVs" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 160, + "outputs": [ + { + "data": { + "text/plain": " title_texte 85 44 50 \\\n1 ipa supply equipment increase competitiveness... False True False \n3 provision language training service tender in... False False False \n4 service support eda helicopter portfolio main... False False False \n5 NUMBER cp op NUMBER pooling share cost non co... False False False \n6 edf supply transport household similar waste ... False False False \n\n 80 73 45 71 79 90 ... 03 24 43 19 \\\n1 False False False False False False ... False False False False \n3 True False False False False False ... False False False False \n4 True False False False False False ... False False False False \n5 False True False False False False ... False False False False \n6 False False True False False False ... 
False False False False \n\n 41 37 14 16 76 topic \n1 False False False False False 29 \n3 False False False False False 40 \n4 False False False False False 11 \n5 False False False False False 23 \n6 False False False False False 49 \n\n[5 rows x 47 columns]", + "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>title_texte</th>\n <th>85</th>\n <th>44</th>\n <th>50</th>\n <th>80</th>\n <th>73</th>\n <th>45</th>\n <th>71</th>\n <th>79</th>\n <th>90</th>\n <th>...</th>\n <th>03</th>\n <th>24</th>\n <th>43</th>\n <th>19</th>\n <th>41</th>\n <th>37</th>\n <th>14</th>\n <th>16</th>\n <th>76</th>\n <th>topic</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>ipa supply equipment increase competitiveness...</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>29</td>\n </tr>\n <tr>\n <th>3</th>\n <td>provision language training service tender in...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>40</td>\n </tr>\n <tr>\n <th>4</th>\n <td>service support eda helicopter portfolio main...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n 
<td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>11</td>\n </tr>\n <tr>\n <th>5</th>\n <td>NUMBER cp op NUMBER pooling share cost non co...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>23</td>\n </tr>\n <tr>\n <th>6</th>\n <td>edf supply transport household similar waste ...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>49</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 47 columns</p>\n</div>" + }, + "execution_count": 160, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def find_topic(text: str):\n", + " text_bow = dictionary.doc2bow(text.split(\" \"))\n", + " return sorted(lda_model[text_bow], key=lambda x: x[1])[-1][0]\n", + "\n", + "\n", + "df_topic = df_train.assign(topic=df_train['title_texte'].apply(find_topic))\n", + "df_topic.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Top CPVs per topic" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 161, + "outputs": [], + "source": [ + "cpv_labels = {\n", + " \"03\": \"Agricultural, farming, fishing, forestry and related products\",\n", + " \"09\": \"Petroleum products, fuel, electricity and other sources of energy\",\n", + " \"14\": \"Mining, basic metals and related 
products\",\n", + " \"15\": \"Food, beverages, tobacco and related products\",\n", + " \"16\": \"Agricultural machinery\",\n", + " \"18\": \"Clothing, footwear, luggage articles and accessories\",\n", + " \"19\": \"Leather and textile fabrics, plastic and rubber materials\",\n", + " \"22\": \"Printed matter and related products\",\n", + " \"24\": \"Chemical products\",\n", + " \"30\": \"Office and computing machinery, equipment and supplies except furniture and software packages\",\n", + " \"31\": \"Electrical machinery, apparatus, equipment and consumables; Lighting\",\n", + " \"32\": \"Radio, television, communication, telecommunication and related equipment\",\n", + " \"33\": \"Medical equipments, pharmaceuticals and personal care products\",\n", + " \"34\": \"Transport equipment and auxiliary products to transportation\",\n", + " \"35\": \"Security, fire-fighting, police and defence equipment\",\n", + " \"37\": \"Musical instruments, sport goods, games, toys, handicraft, art materials and accessories\",\n", + " \"38\": \"Laboratory, optical and precision equipments (excl. glasses)\",\n", + " \"39\": \"Furniture (incl. office furniture), furnishings, domestic appliances (excl. lighting) and cleaning products\",\n", + " \"41\": \"Collected and purified water\",\n", + " \"42\": \"Industrial machinery\",\n", + " \"43\": \"Machinery for mining, quarrying, construction equipment\",\n", + " \"44\": \"Construction structures and materials; auxiliary products to construction (excepts electric apparatus)\",\n", + " \"45\": \"Construction work\",\n", + " \"48\": \"Software package and information systems\",\n", + " \"50\": \"Repair and maintenance services\",\n", + " \"51\": \"Installation services (except software)\",\n", + " \"55\": \"Hotel, restaurant and retail trade services\",\n", + " \"60\": \"Transport services (excl. 
Waste transport)\",\n", + " \"63\": \"Supporting and auxiliary transport services; travel agencies services\",\n", + " \"64\": \"Postal and telecommunications services\",\n", + " \"65\": \"Public utilities\",\n", + " \"66\": \"Financial and insurance services\",\n", + " \"70\": \"Real estate services\",\n", + " \"71\": \"Architectural, construction, engineering and inspection services\",\n", + " \"72\": \"IT services: consulting, software development, Internet and support\",\n", + " \"73\": \"Research and development services and related consultancy services\",\n", + " \"75\": \"Administration, defence and social security services\",\n", + " \"76\": \"Services related to the oil and gas industry\",\n", + " \"77\": \"Agricultural, forestry, horticultural, aquacultural and apicultural services\",\n", + " \"79\": \"Business services: law, marketing, consulting, recruitment, printing and security\",\n", + " \"80\": \"Education and training services\",\n", + " \"85\": \"Health and social work services\",\n", + " \"90\": \"Sewage-, refuse-, cleaning-, and environmental services\",\n", + " \"92\": \"Recreational, cultural and sporting services\",\n", + " \"98\": \"Other community, social and personal services\",\n", + "}" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 162, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "===\n", + "0 -> 0.091*\"service\" + 0.064*\"event\" + 0.032*\"medical\" + 0.031*\"travel\" + 0.028*\"provision\" + 0.026*\"organisation\" + 0.023*\"agency\" + 0.019*\"conference\" + 0.017*\"provide\" + 0.015*\"online\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 93)\n", + "('Supporting and auxiliary transport services; travel agencies services', 49)\n", + "('Health and social work services', 32)\n", + "===\n", + "1 -> 0.074*\"medium\" + 0.052*\"service\" + 0.046*\"european\" + 0.027*\"social\" + 
0.026*\"parliament\" + 0.023*\"monitoring\" + 0.022*\"committee\" + 0.022*\"communication\" + 0.019*\"contract\" + 0.019*\"NUMBER\"\n", + "---\n", + "('Recreational, cultural and sporting services', 65)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 45)\n", + "('Research and development services and related consultancy services', 29)\n", + "===\n", + "2 -> 0.128*\"NUMBER\" + 0.056*\"directive\" + 0.051*\"regulation\" + 0.035*\"council\" + 0.033*\"eu\" + 0.030*\"ec\" + 0.029*\"european\" + 0.025*\"commission\" + 0.019*\"article\" + 0.018*\"parliament\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 66)\n", + "('Research and development services and related consultancy services', 30)\n", + "('Sewage-, refuse-, cleaning-, and environmental services', 30)\n", + "===\n", + "3 -> 0.031*\"system\" + 0.028*\"contract\" + 0.021*\"support\" + 0.019*\"iter\" + 0.015*\"study\" + 0.014*\"service\" + 0.013*\"provide\" + 0.013*\"development\" + 0.013*\"capability\" + 0.012*\"NUMBER\"\n", + "---\n", + "('Research and development services and related consultancy services', 73)\n", + "('Architectural, construction, engineering and inspection services', 47)\n", + "('Laboratory, optical and precision equipments (excl. 
glasses)', 19)\n", + "===\n", + "4 -> 0.053*\"service\" + 0.046*\"production\" + 0.045*\"document\" + 0.040*\"product\" + 0.030*\"language\" + 0.029*\"design\" + 0.024*\"material\" + 0.022*\"tender\" + 0.021*\"relate\" + 0.020*\"contract\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 103)\n", + "('IT services: consulting, software development, Internet and support', 27)\n", + "('Recreational, cultural and sporting services', 20)\n", + "===\n", + "5 -> 0.077*\"maintenance\" + 0.071*\"installation\" + 0.055*\"supply\" + 0.044*\"system\" + 0.023*\"NUMBER\" + 0.023*\"jrc\" + 0.016*\"equipment\" + 0.015*\"plant\" + 0.014*\"contract\" + 0.014*\"include\"\n", + "---\n", + "('Laboratory, optical and precision equipments (excl. glasses)', 82)\n", + "('Repair and maintenance services', 77)\n", + "('Construction work', 62)\n", + "===\n", + "6 -> 0.301*\"NUMBER\" + 0.250*\"lot\" + 0.040*\"divide\" + 0.034*\"supply\" + 0.018*\"contract\" + 0.017*\"tender\" + 0.012*\"electricity\" + 0.012*\"ecb\" + 0.011*\"main\" + 0.011*\"follow\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 38)\n", + "('Construction work', 12)\n", + "('IT services: consulting, software development, Internet and support', 11)\n", + "===\n", + "7 -> 0.118*\"training\" + 0.036*\"NUMBER\" + 0.028*\"programme\" + 0.021*\"course\" + 0.020*\"activity\" + 0.019*\"protection\" + 0.019*\"organisation\" + 0.016*\"civil\" + 0.013*\"contract\" + 0.013*\"initiative\"\n", + "---\n", + "('Education and training services', 135)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 33)\n", + "('Research and development services and related consultancy services', 18)\n", + "===\n", + "8 -> 0.081*\"NUMBER\" + 0.057*\"authority\" + 0.051*\"contracting\" + 0.046*\"address\" + 0.042*\"day\" + 0.040*\"point\" + 0.039*\"email\" + 0.038*\"seek\" + 0.037*\"submission\" + 
0.037*\"deadline\"\n", + "---\n", + "('Architectural, construction, engineering and inspection services', 226)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 22)\n", + "('Administration, defence and social security services', 17)\n", + "===\n", + "9 -> 0.085*\"construction\" + 0.064*\"water\" + 0.056*\"work\" + 0.037*\"eib\" + 0.025*\"supervision\" + 0.025*\"project\" + 0.020*\"design\" + 0.018*\"treatment\" + 0.017*\"plant\" + 0.016*\"system\"\n", + "---\n", + "('Construction work', 201)\n", + "('Architectural, construction, engineering and inspection services', 111)\n", + "('Electrical machinery, apparatus, equipment and consumables; Lighting', 17)\n", + "===\n", + "10 -> 0.081*\"project\" + 0.065*\"technical\" + 0.061*\"assistance\" + 0.047*\"subject\" + 0.035*\"authority\" + 0.033*\"contract\" + 0.030*\"contracting\" + 0.029*\"extend\" + 0.028*\"scope\" + 0.028*\"availability\"\n", + "---\n", + "('Architectural, construction, engineering and inspection services', 206)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 59)\n", + "('Administration, defence and social security services', 32)\n", + "===\n", + "11 -> 0.208*\"service\" + 0.085*\"provision\" + 0.022*\"procedure\" + 0.021*\"consultancy\" + 0.021*\"procurement\" + 0.020*\"relate\" + 0.019*\"provider\" + 0.017*\"ict\" + 0.016*\"management\" + 0.015*\"related\"\n", + "---\n", + "('IT services: consulting, software development, Internet and support', 134)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 97)\n", + "('Sewage-, refuse-, cleaning-, and environmental services', 30)\n", + "===\n", + "12 -> 0.099*\"service\" + 0.049*\"european\" + 0.041*\"contract\" + 0.037*\"tender\" + 0.036*\"company\" + 0.035*\"framework\" + 0.034*\"eea\" + 0.033*\"security\" + 0.033*\"union\" + 0.033*\"NUMBER\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, 
printing and security', 104)\n", + "('Sewage-, refuse-, cleaning-, and environmental services', 43)\n", + "('Supporting and auxiliary transport services; travel agencies services', 11)\n", + "===\n", + "13 -> 0.077*\"translation\" + 0.035*\"european\" + 0.035*\"union\" + 0.035*\"french\" + 0.032*\"body\" + 0.029*\"centre\" + 0.029*\"english\" + 0.025*\"german\" + 0.023*\"property\" + 0.023*\"framework\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 96)\n", + "('Education and training services', 6)\n", + "('Sewage-, refuse-, cleaning-, and environmental services', 1)\n", + "===\n", + "14 -> 0.054*\"emergency\" + 0.049*\"ipa\" + 0.026*\"woman\" + 0.023*\"georgia\" + 0.021*\"turkey\" + 0.020*\"country\" + 0.020*\"south\" + 0.018*\"response\" + 0.018*\"assistance\" + 0.018*\"negotiation\"\n", + "---\n", + "('Architectural, construction, engineering and inspection services', 11)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 5)\n", + "('Medical equipments, pharmaceuticals and personal care products', 5)\n", + "===\n", + "15 -> 0.042*\"skill\" + 0.041*\"industrial\" + 0.032*\"labour\" + 0.029*\"improve\" + 0.025*\"rail\" + 0.021*\"market\" + 0.018*\"housing\" + 0.018*\"reporting\" + 0.016*\"purchase\" + 0.015*\"register\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 13)\n", + "('Research and development services and related consultancy services', 12)\n", + "('Architectural, construction, engineering and inspection services', 9)\n", + "===\n", + "16 -> 0.169*\"equipment\" + 0.089*\"supply\" + 0.064*\"insurance\" + 0.031*\"purchase\" + 0.027*\"computer\" + 0.027*\"installation\" + 0.023*\"service\" + 0.022*\"furniture\" + 0.021*\"hospital\" + 0.020*\"accident\"\n", + "---\n", + "('Financial and insurance services', 32)\n", + "('Office and computing machinery, equipment and supplies except furniture and software 
packages', 28)\n", + "('Medical equipments, pharmaceuticals and personal care products', 26)\n", + "===\n", + "17 -> 0.037*\"project\" + 0.030*\"sme\" + 0.029*\"helpdesk\" + 0.025*\"payment\" + 0.022*\"financing\" + 0.022*\"pilot\" + 0.022*\"enterprise\" + 0.021*\"development\" + 0.016*\"high\" + 0.015*\"service\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 44)\n", + "('Financial and insurance services', 16)\n", + "('Research and development services and related consultancy services', 11)\n", + "===\n", + "18 -> 0.070*\"study\" + 0.051*\"eu\" + 0.043*\"analysis\" + 0.023*\"european\" + 0.021*\"provide\" + 0.020*\"market\" + 0.019*\"information\" + 0.015*\"product\" + 0.014*\"feasibility\" + 0.013*\"NUMBER\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 95)\n", + "('Research and development services and related consultancy services', 68)\n", + "('Sewage-, refuse-, cleaning-, and environmental services', 20)\n", + "===\n", + "19 -> 0.059*\"risk\" + 0.043*\"food\" + 0.043*\"assessment\" + 0.035*\"safety\" + 0.027*\"efsa\" + 0.024*\"health\" + 0.022*\"laboratory\" + 0.022*\"scientific\" + 0.020*\"chemical\" + 0.020*\"procurement\"\n", + "---\n", + "('Research and development services and related consultancy services', 88)\n", + "('Health and social work services', 24)\n", + "('Architectural, construction, engineering and inspection services', 17)\n", + "===\n", + "20 -> 0.088*\"development\" + 0.069*\"system\" + 0.059*\"support\" + 0.054*\"maintenance\" + 0.046*\"software\" + 0.036*\"information\" + 0.031*\"service\" + 0.030*\"management\" + 0.023*\"solution\" + 0.022*\"application\"\n", + "---\n", + "('IT services: consulting, software development, Internet and support', 185)\n", + "('Software package and information systems', 52)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 27)\n", + "===\n", + "21 -> 
0.116*\"service\" + 0.115*\"framework\" + 0.107*\"contract\" + 0.081*\"provision\" + 0.036*\"staff\" + 0.027*\"office\" + 0.023*\"multiple\" + 0.019*\"conclude\" + 0.018*\"tender\" + 0.016*\"agency\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 173)\n", + "('IT services: consulting, software development, Internet and support', 37)\n", + "('Research and development services and related consultancy services', 30)\n", + "===\n", + "22 -> 0.096*\"contract\" + 0.063*\"framework\" + 0.055*\"NUMBER\" + 0.053*\"year\" + 0.026*\"lot\" + 0.025*\"order\" + 0.024*\"award\" + 0.021*\"maximum\" + 0.021*\"contractor\" + 0.021*\"conclusion\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 49)\n", + "('IT services: consulting, software development, Internet and support', 6)\n", + "('Construction work', 4)\n", + "===\n", + "23 -> 0.032*\"NUMBER\" + 0.029*\"member\" + 0.028*\"eu\" + 0.026*\"state\" + 0.022*\"study\" + 0.015*\"national\" + 0.013*\"cost\" + 0.013*\"use\" + 0.013*\"provide\" + 0.012*\"objective\"\n", + "---\n", + "('Research and development services and related consultancy services', 150)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 94)\n", + "('Sewage-, refuse-, cleaning-, and environmental services', 59)\n", + "===\n", + "24 -> 0.045*\"health\" + 0.027*\"mobile\" + 0.024*\"occupational\" + 0.023*\"radio\" + 0.021*\"statistical\" + 0.019*\"telecommunication\" + 0.018*\"transmission\" + 0.015*\"supply\" + 0.015*\"multi\" + 0.015*\"measurement\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 23)\n", + "('Postal and telecommunications services', 7)\n", + "('Laboratory, optical and precision equipments (excl. 
glasses)', 6)\n", + "===\n", + "25 -> 0.136*\"security\" + 0.072*\"service\" + 0.045*\"system\" + 0.044*\"provide\" + 0.040*\"delegation\" + 0.038*\"european\" + 0.036*\"union\" + 0.033*\"resource\" + 0.028*\"human\" + 0.025*\"provision\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 138)\n", + "('Sewage-, refuse-, cleaning-, and environmental services', 9)\n", + "('Supporting and auxiliary transport services; travel agencies services', 8)\n", + "===\n", + "26 -> 0.056*\"support\" + 0.039*\"policy\" + 0.035*\"eu\" + 0.024*\"environmental\" + 0.022*\"implementation\" + 0.021*\"commission\" + 0.020*\"contract\" + 0.019*\"provide\" + 0.018*\"development\" + 0.015*\"indicator\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 103)\n", + "('Sewage-, refuse-, cleaning-, and environmental services', 75)\n", + "('Research and development services and related consultancy services', 47)\n", + "===\n", + "27 -> 0.078*\"vehicle\" + 0.046*\"court\" + 0.044*\"european\" + 0.041*\"test\" + 0.032*\"justice\" + 0.032*\"emission\" + 0.031*\"union\" + 0.023*\"car\" + 0.023*\"supply\" + 0.023*\"fuel\"\n", + "---\n", + "('Transport equipment and auxiliary products to transportation', 22)\n", + "('Architectural, construction, engineering and inspection services', 11)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 9)\n", + "===\n", + "28 -> 0.166*\"energy\" + 0.050*\"house\" + 0.044*\"service\" + 0.043*\"eu\" + 0.037*\"reception\" + 0.031*\"efficiency\" + 0.030*\"tender\" + 0.028*\"renewable\" + 0.027*\"premise\" + 0.020*\"security\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 40)\n", + "('Sewage-, refuse-, cleaning-, and environmental services', 19)\n", + "('Research and development services and related consultancy services', 10)\n", + "===\n", + "29 -> 0.128*\"contract\" 
+ 0.087*\"tenderer\" + 0.066*\"guarantee\" + 0.064*\"provide\" + 0.044*\"NUMBER\" + 0.023*\"return\" + 0.022*\"period\" + 0.022*\"later\" + 0.022*\"performance\" + 0.022*\"receive\"\n", + "---\n", + "('Construction work', 71)\n", + "('Office and computing machinery, equipment and supplies except furniture and software packages', 56)\n", + "('Architectural, construction, engineering and inspection services', 55)\n", + "===\n", + "30 -> 0.045*\"NUMBER\" + 0.041*\"supply\" + 0.033*\"room\" + 0.032*\"gas\" + 0.023*\"galileo\" + 0.021*\"fuel\" + 0.020*\"use\" + 0.018*\"satellite\" + 0.016*\"natural\" + 0.014*\"heating\"\n", + "---\n", + "('Petroleum products, fuel, electricity and other sources of energy', 23)\n", + "('Laboratory, optical and precision equipments (excl. glasses)', 15)\n", + "('Research and development services and related consultancy services', 11)\n", + "===\n", + "31 -> 0.125*\"datum\" + 0.023*\"collection\" + 0.023*\"review\" + 0.022*\"eu\" + 0.018*\"data\" + 0.017*\"country\" + 0.014*\"approach\" + 0.013*\"research\" + 0.012*\"study\" + 0.012*\"objective\"\n", + "---\n", + "('Research and development services and related consultancy services', 59)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 43)\n", + "('IT services: consulting, software development, Internet and support', 28)\n", + "===\n", + "32 -> 0.042*\"support\" + 0.029*\"task\" + 0.025*\"provide\" + 0.025*\"coordination\" + 0.021*\"specific\" + 0.019*\"contract\" + 0.017*\"technical\" + 0.017*\"service\" + 0.016*\"ecdc\" + 0.016*\"video\"\n", + "---\n", + "('Research and development services and related consultancy services', 28)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 28)\n", + "('Health and social work services', 17)\n", + "===\n", + "33 -> 0.055*\"evaluation\" + 0.045*\"impact\" + 0.044*\"assessment\" + 0.037*\"study\" + 0.030*\"NUMBER\" + 0.028*\"economic\" + 0.024*\"policy\" + 
0.023*\"contract\" + 0.022*\"analysis\" + 0.015*\"gender\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 73)\n", + "('Research and development services and related consultancy services', 43)\n", + "('Agricultural, forestry, horticultural, aquacultural and apicultural services', 11)\n", + "===\n", + "34 -> 0.108*\"contract\" + 0.075*\"tenderer\" + 0.052*\"bond\" + 0.045*\"successful\" + 0.045*\"NUMBER\" + 0.040*\"sign\" + 0.036*\"provide\" + 0.036*\"require\" + 0.022*\"price\" + 0.021*\"time\"\n", + "---\n", + "('Office and computing machinery, equipment and supplies except furniture and software packages', 26)\n", + "('Construction work', 24)\n", + "('Transport equipment and auxiliary products to transportation', 19)\n", + "===\n", + "35 -> 0.071*\"service\" + 0.058*\"eib\" + 0.049*\"management\" + 0.040*\"ecb\" + 0.032*\"consultancy\" + 0.028*\"provision\" + 0.026*\"support\" + 0.024*\"bank\" + 0.021*\"group\" + 0.019*\"framework\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 118)\n", + "('IT services: consulting, software development, Internet and support', 49)\n", + "('Architectural, construction, engineering and inspection services', 35)\n", + "===\n", + "36 -> 0.133*\"NUMBER\" + 0.030*\"audit\" + 0.027*\"general\" + 0.024*\"financial\" + 0.023*\"eu\" + 0.022*\"service\" + 0.021*\"contract\" + 0.020*\"gsa\" + 0.020*\"MONTH\" + 0.018*\"budget\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 29)\n", + "('Architectural, construction, engineering and inspection services', 14)\n", + "('Research and development services and related consultancy services', 13)\n", + "===\n", + "37 -> 0.055*\"infrastructure\" + 0.055*\"operation\" + 0.052*\"access\" + 0.030*\"database\" + 0.026*\"support\" + 0.024*\"investment\" + 0.020*\"implementation\" + 0.019*\"finance\" + 0.016*\"subscription\" + 
0.016*\"platform\"\n", + "---\n", + "('Architectural, construction, engineering and inspection services', 34)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 17)\n", + "('IT services: consulting, software development, Internet and support', 13)\n", + "===\n", + "38 -> 0.135*\"NUMBER\" + 0.077*\"road\" + 0.069*\"work\" + 0.041*\"construction\" + 0.037*\"km\" + 0.028*\"railway\" + 0.024*\"rehabilitation\" + 0.022*\"line\" + 0.018*\"section\" + 0.017*\"safety\"\n", + "---\n", + "('Construction work', 88)\n", + "('Architectural, construction, engineering and inspection services', 39)\n", + "('Research and development services and related consultancy services', 6)\n", + "===\n", + "39 -> 0.096*\"european\" + 0.078*\"building\" + 0.057*\"parliament\" + 0.039*\"brussels\" + 0.038*\"service\" + 0.030*\"contract\" + 0.027*\"luxembourg\" + 0.022*\"NUMBER\" + 0.021*\"strasbourg\" + 0.019*\"work\"\n", + "---\n", + "('Construction work', 87)\n", + "('Architectural, construction, engineering and inspection services', 66)\n", + "('Repair and maintenance services', 59)\n", + "===\n", + "40 -> 0.056*\"administrative\" + 0.055*\"legal\" + 0.053*\"law\" + 0.050*\"advice\" + 0.039*\"sea\" + 0.027*\"service\" + 0.026*\"different\" + 0.021*\"cultural\" + 0.020*\"assistance\" + 0.018*\"learning\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 20)\n", + "('Architectural, construction, engineering and inspection services', 6)\n", + "('Education and training services', 3)\n", + "===\n", + "41 -> 0.035*\"climate\" + 0.033*\"copernicus\" + 0.032*\"service\" + 0.030*\"NUMBER\" + 0.028*\"support\" + 0.027*\"change\" + 0.022*\"global\" + 0.022*\"activity\" + 0.019*\"land\" + 0.015*\"monitoring\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 72)\n", + "('Research and development services and related consultancy services', 57)\n", + "('IT 
services: consulting, software development, Internet and support', 54)\n", + "===\n", + "42 -> 0.074*\"service\" + 0.069*\"european\" + 0.054*\"communication\" + 0.049*\"commission\" + 0.029*\"information\" + 0.023*\"public\" + 0.022*\"representation\" + 0.021*\"provision\" + 0.019*\"activity\" + 0.016*\"provide\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 119)\n", + "('Postal and telecommunications services', 37)\n", + "('IT services: consulting, software development, Internet and support', 27)\n", + "===\n", + "43 -> 0.081*\"european\" + 0.077*\"network\" + 0.067*\"education\" + 0.038*\"school\" + 0.035*\"support\" + 0.023*\"commission\" + 0.018*\"high\" + 0.018*\"programme\" + 0.014*\"research\" + 0.013*\"procure\"\n", + "---\n", + "('Education and training services', 30)\n", + "('Architectural, construction, engineering and inspection services', 21)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 17)\n", + "===\n", + "44 -> 0.090*\"application\" + 0.087*\"form\" + 0.045*\"section\" + 0.045*\"standard\" + 0.045*\"NUMBER\" + 0.044*\"candidate\" + 0.043*\"situation\" + 0.043*\"submit\" + 0.042*\"sign\" + 0.042*\"include\"\n", + "---\n", + "('Architectural, construction, engineering and inspection services', 405)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 205)\n", + "('Administration, defence and social security services', 129)\n", + "===\n", + "45 -> 0.105*\"transport\" + 0.066*\"survey\" + 0.038*\"european\" + 0.038*\"international\" + 0.030*\"statistic\" + 0.029*\"aviation\" + 0.022*\"safety\" + 0.021*\"level\" + 0.018*\"service\" + 0.018*\"agency\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 40)\n", + "('Research and development services and related consultancy services', 18)\n", + "('IT services: consulting, software development, Internet and 
support', 8)\n", + "===\n", + "46 -> 0.074*\"site\" + 0.069*\"jrc\" + 0.044*\"centre\" + 0.042*\"ispra\" + 0.036*\"joint\" + 0.034*\"research\" + 0.028*\"building\" + 0.027*\"service\" + 0.025*\"work\" + 0.024*\"NUMBER\"\n", + "---\n", + "('Construction work', 49)\n", + "('Architectural, construction, engineering and inspection services', 36)\n", + "('Repair and maintenance services', 19)\n", + "===\n", + "47 -> 0.046*\"NUMBER\" + 0.035*\"service\" + 0.030*\"device\" + 0.029*\"equipment\" + 0.022*\"time\" + 0.022*\"frontex\" + 0.020*\"acquisition\" + 0.019*\"provision\" + 0.017*\"supply\" + 0.016*\"purchase\"\n", + "---\n", + "('IT services: consulting, software development, Internet and support', 23)\n", + "('Office and computing machinery, equipment and supplies except furniture and software packages', 20)\n", + "('Repair and maintenance services', 18)\n", + "===\n", + "48 -> 0.030*\"study\" + 0.022*\"digital\" + 0.021*\"practice\" + 0.021*\"sector\" + 0.020*\"market\" + 0.020*\"eu\" + 0.019*\"good\" + 0.019*\"innovation\" + 0.018*\"objective\" + 0.017*\"smart\"\n", + "---\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 182)\n", + "('Research and development services and related consultancy services', 124)\n", + "('IT services: consulting, software development, Internet and support', 24)\n", + "===\n", + "49 -> 0.075*\"waste\" + 0.050*\"epo\" + 0.046*\"right\" + 0.043*\"nuclear\" + 0.042*\"allow\" + 0.041*\"de\" + 0.035*\"reduce\" + 0.035*\"safeguard\" + 0.034*\"proposal\" + 0.030*\"effect\"\n", + "---\n", + "('Sewage-, refuse-, cleaning-, and environmental services', 21)\n", + "('Business services: law, marketing, consulting, recruitment, printing and security', 18)\n", + "('Construction work', 14)\n" + ] + } + ], + "source": [ + "for idx, topic in lda_model.print_topics(-1):\n", + " print(\"===\")\n", + " print(idx, \"->\", topic)\n", + " print(\"---\")\n", + " df_filtered = df_topic[df_topic[\"topic\"] == idx]\n", + 
" df_filtered = df_filtered.drop([\"title_texte\", \"topic\"], axis=1)\n", + " count_per_cpv = {cpv_labels[c]: df_filtered[c].sum() for c in df_filtered.columns}\n", + " top_cpvs = sorted(count_per_cpv.items(), key=lambda x: x[1], reverse=True)\n", + " for j in range(3):\n", + " print(top_cpvs[j])" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Evaluation of CPV classification" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 163, + "outputs": [], + "source": [ + "from scipy.stats import entropy\n", + "import numpy as np\n", + "\n", + "\n", + "# https://github.com/soberbichler/Using-LDA-and-Jensen-Shannon-distance-to-separate-relevant-from-non-relevant-articles/blob/master/news_article_similarity_remigration_notebook.ipynb\n", + "def jensen_shannon(query, matrix): # Jensen-Shannon distance between one distribution and every row of `matrix`\n", + " p = query[None, :].T # query turned into a single column\n", + " q = matrix.T # one column per row of `matrix`; entropy() reduces over axis 0\n", + " m = 0.5 * (p + q) # midpoint distribution of each (query, row) pair\n", + " return np.sqrt(np.maximum(0.5 * (entropy(p, m) + entropy(q, m)), 0.0)) # clamp rounding-induced negatives: identical distributions must give 0, not NaN\n", + "\n", + "\n", + "def get_doc_similarities(query, matrix):\n", + " sims = jensen_shannon(query, matrix) # list of jensen shannon distances\n", + " return sims" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 164, + "outputs": [], + "source": [ + "corpus = df_train[\"title_texte\"].apply(lambda x: dictionary.doc2bow(x.split(\" \")))\n", + "all_dists = np.stack([np.array([tup[1] for tup in lst]) for lst in lda_model[list(corpus)]])" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### With bag-of-words" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 165, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%| | 20/2912 [00:01<02:35, 18.59it/s]/tmp/ipykernel_24919/3601534538.py:10: RuntimeWarning: invalid value encountered in sqrt\n", + " return np.sqrt(0.5 * (entropy(p, m) + 
entropy(q, m)))\n", + "100%|██████████| 2912/2912 [02:47<00:00, 17.42it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " Health and social work services... | 85 0.03 0.08 0.04 62\n", + "Construction structures and materials; a... | 44 0.01 0.04 0.02 23\n", + " Repair and maintenance services... | 50 0.04 0.10 0.05 68\n", + " Education and training services... | 80 0.02 0.05 0.03 99\n", + "Research and development services and re... | 73 0.08 0.22 0.12 249\n", + " Construction work... | 45 0.08 0.19 0.12 191\n", + "Architectural, construction, engineering... | 71 0.13 0.32 0.19 404\n", + "Business services: law, marketing, consu... | 79 0.24 0.54 0.34 688\n", + "Sewage-, refuse-, cleaning-, and environ... | 90 0.05 0.11 0.07 176\n", + "Office and computing machinery, equipmen... | 30 0.03 0.07 0.04 70\n", + "Security, fire-fighting, police and defe... | 35 0.04 0.08 0.05 36\n", + "Medical equipments, pharmaceuticals and ... | 33 0.01 0.03 0.01 38\n", + "Hotel, restaurant and retail trade servi... | 55 0.03 0.07 0.04 30\n", + "IT services: consulting, software develo... | 72 0.06 0.19 0.09 197\n", + "Software package and information systems... | 48 0.01 0.05 0.02 43\n", + "Laboratory, optical and precision equipm... | 38 0.03 0.10 0.05 61\n", + "Petroleum products, fuel, electricity an... | 09 0.01 0.03 0.02 34\n", + "Administration, defence and social secur... | 75 0.03 0.08 0.04 77\n", + " Financial and insurance services... | 66 0.01 0.04 0.02 46\n", + " Postal and telecommunications services... | 64 0.01 0.03 0.01 37\n", + " Industrial machinery... | 42 0.00 0.00 0.00 27\n", + "Transport equipment and auxiliary produc... | 34 0.02 0.06 0.03 50\n", + "Transport services (excl. Waste transpor... | 60 0.03 0.05 0.03 41\n", + "Recreational, cultural and sporting serv... | 92 0.01 0.02 0.01 43\n", + "Furniture (incl. office furniture), furn... 
| 39 0.01 0.04 0.02 49\n", + "Electrical machinery, apparatus, equipme... | 31 0.00 0.00 0.00 36\n", + "Other community, social and personal ser... | 98 0.00 0.00 0.00 35\n", + " Installation services (except software)... | 51 0.00 0.00 0.00 8\n", + "Radio, television, communication, teleco... | 32 0.03 0.07 0.04 40\n", + " Public utilities... | 65 0.00 0.00 0.00 15\n", + "Agricultural, forestry, horticultural, a... | 77 0.00 0.00 0.00 22\n", + " Printed matter and related products... | 22 0.00 0.00 0.00 20\n", + "Supporting and auxiliary transport servi... | 63 0.00 0.00 0.00 34\n", + "Food, beverages, tobacco and related pro... | 15 0.03 0.20 0.06 5\n", + " Real estate services... | 70 0.00 0.00 0.00 14\n", + "Clothing, footwear, luggage articles and... | 18 0.00 0.00 0.00 5\n", + "Agricultural, farming, fishing, forestry... | 03 0.00 0.00 0.00 5\n", + " Chemical products... | 24 0.00 0.00 0.00 11\n", + "Machinery for mining, quarrying, constru... | 43 0.00 0.00 0.00 1\n", + "Leather and textile fabrics, plastic and... | 19 0.00 0.00 0.00 5\n", + " Collected and purified water... | 41 0.00 0.00 0.00 2\n", + "Musical instruments, sport goods, games,... | 37 0.00 0.00 0.00 3\n", + "Mining, basic metals and related product... | 14 0.00 0.00 0.00 6\n", + " Agricultural machinery... | 16 0.00 0.00 0.00 4\n", + "Services related to the oil and gas indu... | 76 0.00 0.00 0.00 3\n", + "\n", + " micro avg 0.09 0.23 0.13 3113\n", + " macro avg 0.02 0.06 0.04 3113\n", + " weighted avg 0.10 0.23 0.14 3113\n", + " samples avg 0.10 0.23 0.13 3113\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ferreni/Projects/TED AI/Repositories/tedai-cpv-classification/venv/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "from sklearn.metrics import classification_report\n", + "\n", + "y_true = []\n", + "y_pred = []\n", + "notices = list(df_test.iloc)\n", + "for notice in tqdm(notices, total=len(notices)):\n", + " notice_bow = dictionary.doc2bow(notice[\"title_texte\"].split(\" \"))\n", + " dist = np.array([tup[1] for tup in lda_model.get_document_topics(bow=notice_bow)])\n", + " sims = get_doc_similarities(dist, all_dists)\n", + " most_sim_ids = sims.argsort()[:3] # the top k positional index of the smallest Jensen Shannon distances\n", + " most_similar_df = df_train.iloc[most_sim_ids] # all_dists rows follow df_train order, so select by position (index labels are non-contiguous)\n", + " y_true.append([int(notice[c] == True) for c in cpvs])\n", + " y_pred.append([int(most_similar_df[c].sum() > 0) for c in cpvs])\n", + "print(classification_report(y_true, y_pred, target_names=[f\"{cpv_labels[c][:40]}... | {c}\" for c in cpvs]))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### With TFIDF" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 166, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%| | 34/2912 [00:02<03:32, 13.52it/s]/tmp/ipykernel_24919/3601534538.py:10: RuntimeWarning: invalid value encountered in sqrt\n", + " return np.sqrt(0.5 * (entropy(p, m) + entropy(q, m)))\n", + "100%|██████████| 2912/2912 [03:00<00:00, 16.11it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " Health and social work services... | 85 0.01 0.02 0.01 62\n", + "Construction structures and materials; a... | 44 0.00 0.00 0.00 23\n", + " Repair and maintenance services... | 50 0.02 0.04 0.03 68\n", + " Education and training services... 
| 80 0.04 0.07 0.05 99\n", + "Research and development services and re... | 73 0.08 0.20 0.11 249\n", + " Construction work... | 45 0.04 0.09 0.05 191\n", + "Architectural, construction, engineering... | 71 0.11 0.27 0.15 404\n", + "Business services: law, marketing, consu... | 79 0.22 0.49 0.30 688\n", + "Sewage-, refuse-, cleaning-, and environ... | 90 0.02 0.03 0.02 176\n", + "Office and computing machinery, equipmen... | 30 0.02 0.06 0.03 70\n", + "Security, fire-fighting, police and defe... | 35 0.01 0.03 0.02 36\n", + "Medical equipments, pharmaceuticals and ... | 33 0.02 0.05 0.03 38\n", + "Hotel, restaurant and retail trade servi... | 55 0.00 0.00 0.00 30\n", + "IT services: consulting, software develo... | 72 0.07 0.26 0.11 197\n", + "Software package and information systems... | 48 0.01 0.02 0.01 43\n", + "Laboratory, optical and precision equipm... | 38 0.01 0.07 0.02 61\n", + "Petroleum products, fuel, electricity an... | 09 0.02 0.03 0.02 34\n", + "Administration, defence and social secur... | 75 0.03 0.09 0.04 77\n", + " Financial and insurance services... | 66 0.01 0.02 0.01 46\n", + " Postal and telecommunications services... | 64 0.02 0.03 0.02 37\n", + " Industrial machinery... | 42 0.00 0.00 0.00 27\n", + "Transport equipment and auxiliary produc... | 34 0.01 0.02 0.02 50\n", + "Transport services (excl. Waste transpor... | 60 0.00 0.00 0.00 41\n", + "Recreational, cultural and sporting serv... | 92 0.00 0.00 0.00 43\n", + "Furniture (incl. office furniture), furn... | 39 0.03 0.12 0.05 49\n", + "Electrical machinery, apparatus, equipme... | 31 0.00 0.00 0.00 36\n", + "Other community, social and personal ser... | 98 0.02 0.03 0.02 35\n", + " Installation services (except software)... | 51 0.00 0.00 0.00 8\n", + "Radio, television, communication, teleco... | 32 0.02 0.07 0.03 40\n", + " Public utilities... | 65 0.00 0.00 0.00 15\n", + "Agricultural, forestry, horticultural, a... 
| 77 0.00 0.00 0.00 22\n", + " Printed matter and related products... | 22 0.00 0.00 0.00 20\n", + "Supporting and auxiliary transport servi... | 63 0.00 0.00 0.00 34\n", + "Food, beverages, tobacco and related pro... | 15 0.00 0.00 0.00 5\n", + " Real estate services... | 70 0.00 0.00 0.00 14\n", + "Clothing, footwear, luggage articles and... | 18 0.00 0.00 0.00 5\n", + "Agricultural, farming, fishing, forestry... | 03 0.00 0.00 0.00 5\n", + " Chemical products... | 24 0.00 0.00 0.00 11\n", + "Machinery for mining, quarrying, constru... | 43 0.00 0.00 0.00 1\n", + "Leather and textile fabrics, plastic and... | 19 0.00 0.00 0.00 5\n", + " Collected and purified water... | 41 0.00 0.00 0.00 2\n", + "Musical instruments, sport goods, games,... | 37 0.00 0.00 0.00 3\n", + "Mining, basic metals and related product... | 14 0.00 0.00 0.00 6\n", + " Agricultural machinery... | 16 0.00 0.00 0.00 4\n", + "Services related to the oil and gas indu... | 76 0.00 0.00 0.00 3\n", + "\n", + " micro avg 0.08 0.20 0.11 3113\n", + " macro avg 0.02 0.05 0.03 3113\n", + " weighted avg 0.08 0.20 0.11 3113\n", + " samples avg 0.08 0.20 0.11 3113\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ferreni/Projects/TED AI/Repositories/tedai-cpv-classification/venv/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/home/ferreni/Projects/TED AI/Repositories/tedai-cpv-classification/venv/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "from sklearn.metrics import classification_report\n", + "all_dists_tfidf = np.stack([np.array([tup[1] for tup in lst]) for lst in lda_model_tfidf[list(corpus)]]) # reference distributions must come from the same model as the queries\n", + "y_true = []\n", + "y_pred = []\n", + "notices = list(df_test.iloc)\n", + "for notice in tqdm(notices, total=len(notices)):\n", + " notice_bow = dictionary.doc2bow(notice[\"title_texte\"].split(\" \"))\n", + " dist = np.array([tup[1] for tup in lda_model_tfidf.get_document_topics(bow=notice_bow)])\n", + " sims = get_doc_similarities(dist, all_dists_tfidf)\n", + " most_sim_ids = sims.argsort()[:3] # the top k positional index of the smallest Jensen Shannon distances\n", + " most_similar_df = df_train.iloc[most_sim_ids] # rows of the reference matrix follow df_train order, so select by position (index labels are non-contiguous)\n", + " y_true.append([int(notice[c] == True) for c in cpvs])\n", + " y_pred.append([int(most_similar_df[c].sum() > 0) for c in cpvs])\n", + "print(classification_report(y_true, y_pred, target_names=[f\"{cpv_labels[c][:40]}... 
| {c}\" for c in cpvs]))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/20230327-unsupervised-clustering-lda-coherence.ipynb b/20230327-unsupervised-clustering-lda-coherence.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..3e509895371abb0145395eef454b623fc568beea --- /dev/null +++ b/20230327-unsupervised-clustering-lda-coherence.ipynb @@ -0,0 +1,460 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Unsupervised clustering - find best LDA model based on coherence\n", + "\n", + "## Dataset loading" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 1, + "outputs": [ + { + "data": { + "text/plain": " title_texte 85 44 50 \\\n1 ipa supply equipment increase competitiveness... False True False \n3 provision language training service tender in... False False False \n4 service support eda helicopter portfolio main... False False False \n5 NUMBER cp op NUMBER pooling share cost non co... False False False \n6 edf supply transport household similar waste ... False False False \n\n 80 73 45 71 79 90 ... 18 03 24 43 \\\n1 False False False False False False ... False False False False \n3 True False False False False False ... False False False False \n4 True False False False False False ... False False False False \n5 False True False False False False ... False False False False \n6 False False True False False False ... 
False False False False \n\n 19 41 37 14 16 76 \n1 False False False False False False \n3 False False False False False False \n4 False False False False False False \n5 False False False False False False \n6 False False False False False False \n\n[5 rows x 46 columns]", + "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>title_texte</th>\n <th>85</th>\n <th>44</th>\n <th>50</th>\n <th>80</th>\n <th>73</th>\n <th>45</th>\n <th>71</th>\n <th>79</th>\n <th>90</th>\n <th>...</th>\n <th>18</th>\n <th>03</th>\n <th>24</th>\n <th>43</th>\n <th>19</th>\n <th>41</th>\n <th>37</th>\n <th>14</th>\n <th>16</th>\n <th>76</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>ipa supply equipment increase competitiveness...</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n <tr>\n <th>3</th>\n <td>provision language training service tender in...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n <tr>\n <th>4</th>\n <td>service support eda helicopter portfolio main...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n 
<td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n <tr>\n <th>5</th>\n <td>NUMBER cp op NUMBER pooling share cost non co...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n <tr>\n <th>6</th>\n <td>edf supply transport household similar waste ...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 46 columns</p>\n</div>" + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"20230214-dataset_preprocessed_with_lemma.csv\", index_col=0)\n", + "df.head()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [ + { + "data": { + "text/plain": "((11647, 46), (2912, 46))" + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "cpvs = [c for c in df.columns if len(c) == 2]\n", + "df_train, df_test = train_test_split(df, test_size=0.2, shuffle=False)\n", + "(df_train.shape, df_test.shape)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + 
"data": { + "text/plain": "{'85': 256,\n '44': 103,\n '50': 297,\n '80': 403,\n '73': 1067,\n '45': 731,\n '71': 1621,\n '79': 2682,\n '90': 629,\n '30': 266,\n '35': 145,\n '33': 158,\n '55': 117,\n '72': 914,\n '48': 199,\n '38': 289,\n '09': 128,\n '75': 277,\n '66': 206,\n '64': 148,\n '42': 159,\n '34': 199,\n '60': 122,\n '92': 169,\n '39': 188,\n '31': 139,\n '98': 123,\n '51': 50,\n '32': 185,\n '65': 29,\n '77': 83,\n '22': 61,\n '63': 144,\n '15': 43,\n '70': 44,\n '18': 35,\n '03': 31,\n '24': 30,\n '43': 17,\n '19': 7,\n '41': 13,\n '37': 13,\n '14': 16,\n '16': 5,\n '76': 5}" + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{c: df_train[c].sum() for c in cpvs}" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "data": { + "text/plain": "{'85': 62,\n '44': 23,\n '50': 68,\n '80': 99,\n '73': 249,\n '45': 191,\n '71': 404,\n '79': 688,\n '90': 176,\n '30': 70,\n '35': 36,\n '33': 38,\n '55': 30,\n '72': 197,\n '48': 43,\n '38': 61,\n '09': 34,\n '75': 77,\n '66': 46,\n '64': 37,\n '42': 27,\n '34': 50,\n '60': 41,\n '92': 43,\n '39': 49,\n '31': 36,\n '98': 35,\n '51': 8,\n '32': 40,\n '65': 15,\n '77': 22,\n '22': 20,\n '63': 34,\n '15': 5,\n '70': 14,\n '18': 5,\n '03': 5,\n '24': 11,\n '43': 1,\n '19': 5,\n '41': 2,\n '37': 3,\n '14': 6,\n '16': 4,\n '76': 3}" + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{c: df_test[c].sum() for c in cpvs}" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [], + "source": [ + "import gensim\n", + "\n", + "processed_docs = df_train[\"title_texte\"].apply(lambda x: x.split(\" \"))\n", + "processed_docs_test = df_test[\"title_texte\"].apply(lambda x: x.split(\" \")) # held-out texts for coherence scoring (was built from df_train by mistake)\n", + "\n", + "dictionary = gensim.corpora.Dictionary(processed_docs)\n", + 
"dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)\n", + "bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Find good number of topics" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 14/14 [01:49<00:00, 7.81s/it]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "from gensim.models import CoherenceModel\n", + "\n", + "topic_counts = range(2, 30, 2)\n", + "coherences = []\n", + "\n", + "for topic_count in tqdm(topic_counts, total=len(topic_counts)):\n", + " lda_model = gensim.models.LdaModel(bow_corpus, num_topics=topic_count, id2word=dictionary, passes=2,\n", + " minimum_probability=0.0, random_state=0)\n", + " lda_coherence_model = CoherenceModel(model=lda_model, texts=processed_docs_test, dictionary=dictionary,\n", + " coherence='c_v')\n", + " coherences.append(lda_coherence_model.get_coherence())" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [ + { + "data": { + "text/plain": "<Figure size 640x480 with 1 Axes>", + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from matplotlib import pyplot as plt\n", + "\n", + "plt.plot(topic_counts, coherences)\n", + "plt.xticks(topic_counts, topic_counts)\n", + "plt.show()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Two local maxima found: at 6 and in range [14; 18].\n", + "Let's take 18 topics." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Find good number of passes" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 12/12 [12:07<00:00, 60.60s/it]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "from gensim.models import CoherenceModel\n", + "\n", + "pass_counts = range(2, 50, 4)\n", + "coherences = []\n", + "\n", + "for pass_count in tqdm(pass_counts, total=len(pass_counts)):\n", + " lda_model = gensim.models.LdaModel(bow_corpus, num_topics=18, id2word=dictionary, passes=pass_count,\n", + " minimum_probability=0.0, random_state=0)\n", + " lda_coherence_model = CoherenceModel(model=lda_model, texts=processed_docs_test, dictionary=dictionary,\n", + " coherence='c_v')\n", + " coherences.append(lda_coherence_model.get_coherence())" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [ + { + "data": { + "text/plain": "<Figure size 640x480 with 1 Axes>", + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(pass_counts, coherences)\n", + "plt.xticks(pass_counts, pass_counts)\n", + "plt.show()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "10 passes seems to be a good balance between training time and performance" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Grid search for alpha and eta hyperparameters" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 13, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/8 [00:00<?, ?it/s]\n", + " 0%| | 0/7 [00:00<?, ?it/s]\u001B[A\n", + " 14%|█▍ | 1/7 [00:21<02:09, 21.60s/it]\u001B[A\n", + " 29%|██▊ 
| 2/7 [00:45<01:53, 22.67s/it]\u001B[A\n", + " 43%|████▎ | 3/7 [01:10<01:35, 23.82s/it]\u001B[A\n", + " 57%|█████▋ | 4/7 [01:36<01:14, 24.82s/it]\u001B[A\n", + " 71%|███████▏ | 5/7 [02:03<00:51, 25.67s/it]\u001B[A\n", + " 86%|████████▌ | 6/7 [02:27<00:25, 25.04s/it]\u001B[A\n", + "100%|██████████| 7/7 [02:52<00:00, 24.62s/it]\u001B[A\n", + " 12%|█▎ | 1/8 [02:52<20:06, 172.32s/it]\n", + " 0%| | 0/7 [00:00<?, ?it/s]\u001B[A\n", + " 14%|█▍ | 1/7 [00:28<02:49, 28.25s/it]\u001B[A\n", + " 29%|██▊ | 2/7 [00:54<02:15, 27.03s/it]\u001B[A\n", + " 43%|████▎ | 3/7 [01:19<01:44, 26.06s/it]\u001B[A\n", + " 57%|█████▋ | 4/7 [01:44<01:16, 25.64s/it]\u001B[A\n", + " 71%|███████▏ | 5/7 [02:11<00:52, 26.29s/it]\u001B[A\n", + " 86%|████████▌ | 6/7 [02:34<00:25, 25.16s/it]\u001B[A\n", + "100%|██████████| 7/7 [02:57<00:00, 25.35s/it]\u001B[A\n", + " 25%|██▌ | 2/8 [05:49<17:31, 175.32s/it]\n", + " 0%| | 0/7 [00:00<?, ?it/s]\u001B[A\n", + " 14%|█▍ | 1/7 [00:19<01:56, 19.48s/it]\u001B[A\n", + " 29%|██▊ | 2/7 [00:39<01:38, 19.64s/it]\u001B[A\n", + " 43%|████▎ | 3/7 [00:59<01:19, 19.78s/it]\u001B[A\n", + " 57%|█████▋ | 4/7 [01:19<00:59, 19.93s/it]\u001B[A\n", + " 71%|███████▏ | 5/7 [01:40<00:40, 20.45s/it]\u001B[A\n", + " 86%|████████▌ | 6/7 [02:01<00:20, 20.62s/it]\u001B[A\n", + "100%|██████████| 7/7 [02:21<00:00, 20.25s/it]\u001B[A\n", + " 38%|███▊ | 3/8 [08:11<13:20, 160.01s/it]\n", + " 0%| | 0/7 [00:00<?, ?it/s]\u001B[A\n", + " 14%|█▍ | 1/7 [00:17<01:47, 17.89s/it]\u001B[A\n", + " 29%|██▊ | 2/7 [00:35<01:28, 17.71s/it]\u001B[A\n", + " 43%|████▎ | 3/7 [00:53<01:11, 17.77s/it]\u001B[A\n", + " 57%|█████▋ | 4/7 [01:14<00:57, 19.27s/it]\u001B[A\n", + " 71%|███████▏ | 5/7 [01:37<00:41, 20.55s/it]\u001B[A\n", + " 86%|████████▌ | 6/7 [01:59<00:21, 21.09s/it]\u001B[A\n", + "100%|██████████| 7/7 [02:22<00:00, 20.38s/it]\u001B[A\n", + " 50%|█████ | 4/8 [10:34<10:12, 153.17s/it]\n", + " 0%| | 0/7 [00:00<?, ?it/s]\u001B[A\n", + " 14%|█▍ | 1/7 [00:18<01:50, 18.37s/it]\u001B[A\n", + " 29%|██▊ | 2/7 
from numpy import arange
from tqdm import tqdm
from gensim.models import CoherenceModel

# Grid search over the LDA Dirichlet priors (alpha = document-topic, eta = topic-word).
# For every (alpha, eta) pair a model is trained on `bow_corpus` and scored with the
# c_v coherence measure computed on `processed_docs_test`.
# NOTE(review): `gensim`, `bow_corpus`, `dictionary` and `processed_docs_test` come from
# earlier cells that are not visible here -- confirm they are defined before this cell runs.

topic_count = 18
pass_count = 10

# `arange` accumulates floating-point error (0.30000000000000004, 0.5000000000000001, ...),
# which then shows up in the reported "best" parameters. Round the numeric grid to one
# decimal so the tested values are exactly 0.1, 0.3, 0.5, 0.7 and 0.9.
numeric_grid = [round(float(value), 1) for value in arange(0.1, 1., 0.2)]
alpha_vals = numeric_grid + ['symmetric', 'asymmetric', 'auto']
eta_vals = numeric_grid + ['symmetric', 'auto']

# `params[i]` is the (alpha, eta) pair that produced `coherences[i]`.
params = []
coherences = []
for alpha in tqdm(alpha_vals, desc="alpha"):
    # leave=False keeps the inner bar from leaving one finished line per alpha value.
    for eta in tqdm(eta_vals, desc="eta", leave=False):
        lda_model = gensim.models.LdaModel(bow_corpus, num_topics=topic_count, id2word=dictionary, passes=pass_count,
                                           minimum_probability=0.0, random_state=0, alpha=alpha, eta=eta)
        lda_coherence_model = CoherenceModel(model=lda_model, texts=processed_docs_test, dictionary=dictionary,
                                             coherence='c_v')
        coherences.append(lda_coherence_model.get_coherence())
        params.append((alpha, eta))
asymmetric 0.3 0.605684\n44 asymmetric 0.5 0.613109\n45 asymmetric 0.7 0.612604\n46 asymmetric 0.9 0.612228\n47 asymmetric symmetric 0.602633\n48 asymmetric auto 0.602633\n49 auto 0.1 0.611178\n50 auto 0.3 0.611255\n51 auto 0.5 0.615496\n52 auto 0.7 0.603420\n53 auto 0.9 0.600422\n54 auto symmetric 0.608649\n55 auto auto 0.608649", + "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>alpha</th>\n <th>eta</th>\n <th>coherence</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.1</td>\n <td>0.1</td>\n <td>0.591410</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0.1</td>\n <td>0.3</td>\n <td>0.590619</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0.1</td>\n <td>0.5</td>\n <td>0.604507</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0.1</td>\n <td>0.7</td>\n <td>0.593771</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.1</td>\n <td>0.9</td>\n <td>0.588428</td>\n </tr>\n <tr>\n <th>5</th>\n <td>0.1</td>\n <td>symmetric</td>\n <td>0.593764</td>\n </tr>\n <tr>\n <th>6</th>\n <td>0.1</td>\n <td>auto</td>\n <td>0.593764</td>\n </tr>\n <tr>\n <th>7</th>\n <td>0.3</td>\n <td>0.1</td>\n <td>0.545852</td>\n </tr>\n <tr>\n <th>8</th>\n <td>0.3</td>\n <td>0.3</td>\n <td>0.543587</td>\n </tr>\n <tr>\n <th>9</th>\n <td>0.3</td>\n <td>0.5</td>\n <td>0.545267</td>\n </tr>\n <tr>\n <th>10</th>\n <td>0.3</td>\n <td>0.7</td>\n <td>0.547948</td>\n </tr>\n <tr>\n <th>11</th>\n <td>0.3</td>\n <td>0.9</td>\n <td>0.545067</td>\n </tr>\n <tr>\n <th>12</th>\n <td>0.3</td>\n <td>symmetric</td>\n <td>0.542743</td>\n </tr>\n <tr>\n <th>13</th>\n <td>0.3</td>\n <td>auto</td>\n <td>0.542743</td>\n </tr>\n <tr>\n <th>14</th>\n <td>0.5</td>\n <td>0.1</td>\n <td>0.594534</td>\n </tr>\n <tr>\n <th>15</th>\n <td>0.5</td>\n <td>0.3</td>\n 
<td>0.603920</td>\n </tr>\n <tr>\n <th>16</th>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.605625</td>\n </tr>\n <tr>\n <th>17</th>\n <td>0.5</td>\n <td>0.7</td>\n <td>0.602780</td>\n </tr>\n <tr>\n <th>18</th>\n <td>0.5</td>\n <td>0.9</td>\n <td>0.602691</td>\n </tr>\n <tr>\n <th>19</th>\n <td>0.5</td>\n <td>symmetric</td>\n <td>0.590724</td>\n </tr>\n <tr>\n <th>20</th>\n <td>0.5</td>\n <td>auto</td>\n <td>0.590724</td>\n </tr>\n <tr>\n <th>21</th>\n <td>0.7</td>\n <td>0.1</td>\n <td>0.573548</td>\n </tr>\n <tr>\n <th>22</th>\n <td>0.7</td>\n <td>0.3</td>\n <td>0.571211</td>\n </tr>\n <tr>\n <th>23</th>\n <td>0.7</td>\n <td>0.5</td>\n <td>0.573821</td>\n </tr>\n <tr>\n <th>24</th>\n <td>0.7</td>\n <td>0.7</td>\n <td>0.571404</td>\n </tr>\n <tr>\n <th>25</th>\n <td>0.7</td>\n <td>0.9</td>\n <td>0.575992</td>\n </tr>\n <tr>\n <th>26</th>\n <td>0.7</td>\n <td>symmetric</td>\n <td>0.580720</td>\n </tr>\n <tr>\n <th>27</th>\n <td>0.7</td>\n <td>auto</td>\n <td>0.580720</td>\n </tr>\n <tr>\n <th>28</th>\n <td>0.9</td>\n <td>0.1</td>\n <td>0.598190</td>\n </tr>\n <tr>\n <th>29</th>\n <td>0.9</td>\n <td>0.3</td>\n <td>0.589366</td>\n </tr>\n <tr>\n <th>30</th>\n <td>0.9</td>\n <td>0.5</td>\n <td>0.590532</td>\n </tr>\n <tr>\n <th>31</th>\n <td>0.9</td>\n <td>0.7</td>\n <td>0.593980</td>\n </tr>\n <tr>\n <th>32</th>\n <td>0.9</td>\n <td>0.9</td>\n <td>0.598783</td>\n </tr>\n <tr>\n <th>33</th>\n <td>0.9</td>\n <td>symmetric</td>\n <td>0.596835</td>\n </tr>\n <tr>\n <th>34</th>\n <td>0.9</td>\n <td>auto</td>\n <td>0.596835</td>\n </tr>\n <tr>\n <th>35</th>\n <td>symmetric</td>\n <td>0.1</td>\n <td>0.609470</td>\n </tr>\n <tr>\n <th>36</th>\n <td>symmetric</td>\n <td>0.3</td>\n <td>0.610043</td>\n </tr>\n <tr>\n <th>37</th>\n <td>symmetric</td>\n <td>0.5</td>\n <td>0.618075</td>\n </tr>\n <tr>\n <th>38</th>\n <td>symmetric</td>\n <td>0.7</td>\n <td>0.602709</td>\n </tr>\n <tr>\n <th>39</th>\n <td>symmetric</td>\n <td>0.9</td>\n <td>0.608895</td>\n </tr>\n <tr>\n <th>40</th>\n 
# %% Tabulate the grid-search scores: one row per (alpha, eta) pair with its coherence.
results = pd.DataFrame(params, columns=["alpha", "eta"]).assign(coherence=coherences)
results

# %% Report the best-scoring pair.
# `best` carries (coherence, (alpha, eta)); it starts from the same sentinel as before,
# so only a strictly greater coherence replaces it and the first maximum wins on ties.
best = (0, (0, 0))
for candidate, score in zip(params, coherences):
    best = (score, candidate) if score > best[0] else best
max_coherence, max_params = best
print(f"Max coherence of {max_coherence} for alpha={max_params[0]} and eta={max_params[1]}")
False False False False \n4 True False False False False False ... False False False False \n5 False True False False False False ... False False False False \n6 False False True False False False ... False False False False \n\n 19 41 37 14 16 76 \n1 False False False False False False \n3 False False False False False False \n4 False False False False False False \n5 False False False False False False \n6 False False False False False False \n\n[5 rows x 46 columns]", + "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>title_texte</th>\n <th>85</th>\n <th>44</th>\n <th>50</th>\n <th>80</th>\n <th>73</th>\n <th>45</th>\n <th>71</th>\n <th>79</th>\n <th>90</th>\n <th>...</th>\n <th>18</th>\n <th>03</th>\n <th>24</th>\n <th>43</th>\n <th>19</th>\n <th>41</th>\n <th>37</th>\n <th>14</th>\n <th>16</th>\n <th>76</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>ipa supply equipment increase competitiveness...</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n <tr>\n <th>3</th>\n <td>provision language training service tender in...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>True</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>...</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n <td>False</td>\n </tr>\n <tr>\n 
# %% Dataset loading
import pandas as pd

# The first CSV column is used as the row index; the labels are not contiguous
# (the preview shows 1, 3, 4, 5, 6, ...), so positional and label indexing differ.
df = pd.read_csv("20230214-dataset_preprocessed_with_lemma.csv", index_col=0)
df.head()

# %% Train / test split
from sklearn.model_selection import train_test_split

# Label columns are the ones whose name is exactly two characters long (the CPV codes).
cpvs = [c for c in df.columns if len(c) == 2]
# shuffle=False keeps the file order: the first 80% of rows train, the last 20% test.
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=False)
(df_train.shape, df_test.shape)

# %% Number of positive notices per CPV code in the training split
{c: df_train[c].sum() for c in cpvs}

# %% Number of positive notices per CPV code in the test split
{c: df_test[c].sum() for c in cpvs}

# %% Tokenisation, dictionary and bag-of-words corpus
import gensim

processed_docs = df_train["title_texte"].apply(lambda x: x.split(" "))
# Fixed: the held-out documents were previously built from df_train (copy-paste of the
# line above), so anything evaluated on them was really evaluated on training text.
processed_docs_test = df_test["title_texte"].apply(lambda x: x.split(" "))

# The vocabulary is built from the training documents only.
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# %% Model creation
# alpha='symmetric' / eta=0.5 is the best-scoring pair reported by the grid search in the
# first-experiments notebook; random_state is pinned for reproducibility.
lda_model = gensim.models.LdaModel(bow_corpus, num_topics=18, id2word=dictionary, passes=10,
                                   minimum_probability=0.0, random_state=0, alpha='symmetric', eta=0.5)
# Human-readable name for each two-digit CPV code used as a label column.
# The evaluation cells truncate these names to 40 characters to build the
# `target_names` shown in the classification reports.
cpv_labels = {
    "03": "Agricultural, farming, fishing, forestry and related products",
    "09": "Petroleum products, fuel, electricity and other sources of energy",
    "14": "Mining, basic metals and related products",
    "15": "Food, beverages, tobacco and related products",
    "16": "Agricultural machinery",
    "18": "Clothing, footwear, luggage articles and accessories",
    "19": "Leather and textile fabrics, plastic and rubber materials",
    "22": "Printed matter and related products",
    "24": "Chemical products",
    "30": "Office and computing machinery, equipment and supplies except furniture and software packages",
    "31": "Electrical machinery, apparatus, equipment and consumables; Lighting",
    "32": "Radio, television, communication, telecommunication and related equipment",
    "33": "Medical equipments, pharmaceuticals and personal care products",
    "34": "Transport equipment and auxiliary products to transportation",
    "35": "Security, fire-fighting, police and defence equipment",
    "37": "Musical instruments, sport goods, games, toys, handicraft, art materials and accessories",
    "38": "Laboratory, optical and precision equipments (excl. glasses)",
    "39": "Furniture (incl. office furniture), furnishings, domestic appliances (excl. lighting) and cleaning products",
    "41": "Collected and purified water",
    "42": "Industrial machinery",
    "43": "Machinery for mining, quarrying, construction equipment",
    "44": "Construction structures and materials; auxiliary products to construction (excepts electric apparatus)",
    "45": "Construction work",
    "48": "Software package and information systems",
    "50": "Repair and maintenance services",
    "51": "Installation services (except software)",
    "55": "Hotel, restaurant and retail trade services",
    "60": "Transport services (excl. Waste transport)",
    "63": "Supporting and auxiliary transport services; travel agencies services",
    "64": "Postal and telecommunications services",
    "65": "Public utilities",
    "66": "Financial and insurance services",
    "70": "Real estate services",
    "71": "Architectural, construction, engineering and inspection services",
    "72": "IT services: consulting, software development, Internet and support",
    "73": "Research and development services and related consultancy services",
    "75": "Administration, defence and social security services",
    "76": "Services related to the oil and gas industry",
    "77": "Agricultural, forestry, horticultural, aquacultural and apicultural services",
    "79": "Business services: law, marketing, consulting, recruitment, printing and security",
    "80": "Education and training services",
    "85": "Health and social work services",
    "90": "Sewage-, refuse-, cleaning-, and environmental services",
    "92": "Recreational, cultural and sporting services",
    "98": "Other community, social and personal services",
}
from scipy.stats import entropy
import numpy as np


# https://github.com/soberbichler/Using-LDA-and-Jensen-Shannon-distance-to-separate-relevant-from-non-relevant-articles/blob/master/news_article_similarity_remigration_notebook.ipynb
def jensen_shannon(query, matrix):
    """Jensen-Shannon distance between one distribution and every row of a matrix.

    Parameters
    ----------
    query : array-like, shape (n_topics,)
        Topic distribution of the query document.
    matrix : array-like, shape (n_docs, n_topics)
        Topic distributions of the reference documents, one per row.

    Returns
    -------
    numpy.ndarray, shape (n_docs,)
        Square root of the Jensen-Shannon divergence (natural logarithm) between
        ``query`` and each row of ``matrix``. Values are always finite and >= 0.
    """
    query = np.asarray(query, dtype=float)
    matrix = np.asarray(matrix, dtype=float)
    # Put topics on axis 0 (the axis `entropy` reduces over) and let the single
    # query column broadcast against the n_docs columns of the matrix.
    p = query[None, :].T
    q = matrix.T
    m = 0.5 * (p + q)
    divergence = 0.5 * (entropy(p, m) + entropy(q, m))
    # Rounding error can make the divergence of (nearly) identical distributions
    # slightly negative; sqrt would then return NaN, and argsort places NaN last,
    # i.e. the closest documents would be ranked as the farthest. Clamp at zero.
    return np.sqrt(np.clip(divergence, 0.0, None))


def get_doc_similarities(query, matrix):
    """Return the Jensen-Shannon distance from ``query`` to each row of ``matrix``.

    Thin wrapper around :func:`jensen_shannon`; smaller values mean more similar.
    """
    sims = jensen_shannon(query, matrix)  # array of Jensen-Shannon distances
    return sims
# Bag-of-words vector for every training notice, tokenised on single spaces.
corpus = df_train["title_texte"].apply(lambda text: dictionary.doc2bow(text.split(" ")))

# One topic-probability vector per training notice; each entry returned by the model is
# indexed at position 1 to take the probability.
# NOTE(review): stacking assumes the model returns the same number of topics for every
# document (it was built with minimum_probability=0.0) -- confirm.
doc_topic_rows = [
    np.array([topic_entry[1] for topic_entry in doc_topics])
    for doc_topics in lda_model[list(corpus)]
]
all_dists = np.stack(doc_topic_rows)
| 72 0.06 0.18 0.09 197\n", + "Software package and information systems... | 48 0.02 0.07 0.03 43\n", + "Laboratory, optical and precision equipm... | 38 0.03 0.08 0.04 61\n", + "Petroleum products, fuel, electricity an... | 09 0.01 0.03 0.02 34\n", + "Administration, defence and social secur... | 75 0.05 0.10 0.06 77\n", + " Financial and insurance services... | 66 0.00 0.00 0.00 46\n", + " Postal and telecommunications services... | 64 0.00 0.00 0.00 37\n", + " Industrial machinery... | 42 0.00 0.00 0.00 27\n", + "Transport equipment and auxiliary produc... | 34 0.01 0.04 0.02 50\n", + "Transport services (excl. Waste transpor... | 60 0.07 0.12 0.09 41\n", + "Recreational, cultural and sporting serv... | 92 0.01 0.02 0.01 43\n", + "Furniture (incl. office furniture), furn... | 39 0.00 0.00 0.00 49\n", + "Electrical machinery, apparatus, equipme... | 31 0.02 0.06 0.03 36\n", + "Other community, social and personal ser... | 98 0.05 0.14 0.08 35\n", + " Installation services (except software)... | 51 0.00 0.00 0.00 8\n", + "Radio, television, communication, teleco... | 32 0.01 0.03 0.01 40\n", + " Public utilities... | 65 0.05 0.07 0.06 15\n", + "Agricultural, forestry, horticultural, a... | 77 0.00 0.00 0.00 22\n", + " Printed matter and related products... | 22 0.00 0.00 0.00 20\n", + "Supporting and auxiliary transport servi... | 63 0.00 0.00 0.00 34\n", + "Food, beverages, tobacco and related pro... | 15 0.00 0.00 0.00 5\n", + " Real estate services... | 70 0.00 0.00 0.00 14\n", + "Clothing, footwear, luggage articles and... | 18 0.00 0.00 0.00 5\n", + "Agricultural, farming, fishing, forestry... | 03 0.00 0.00 0.00 5\n", + " Chemical products... | 24 0.00 0.00 0.00 11\n", + "Machinery for mining, quarrying, constru... | 43 0.00 0.00 0.00 1\n", + "Leather and textile fabrics, plastic and... | 19 0.00 0.00 0.00 5\n", + " Collected and purified water... | 41 0.00 0.00 0.00 2\n", + "Musical instruments, sport goods, games,... 
from tqdm import tqdm
from sklearn.metrics import classification_report

# Nearest-neighbour CPV prediction: a test notice receives every CPV code carried by at
# least one of its `top_k` closest training notices (Jensen-Shannon distance between
# LDA topic distributions).
top_k = 3

y_true = []
y_pred = []
notices = [row for _, row in df_test.iterrows()]
for notice in tqdm(notices, total=len(notices)):
    notice_bow = dictionary.doc2bow(notice["title_texte"].split(" "))
    # NOTE(review): assumes every topic is returned (the model was built with
    # minimum_probability=0.0) so this vector lines up with the columns of all_dists.
    dist = np.array([tup[1] for tup in lda_model.get_document_topics(bow=notice_bow)])
    sims = get_doc_similarities(dist, all_dists)
    # A NaN distance comes from sqrt of a tiny negative divergence, i.e. a (near-)identical
    # document; treat it as distance 0 instead of letting argsort push it to the end.
    sims = np.nan_to_num(sims, nan=0.0)
    most_sim_ids = sims.argsort()[:top_k]  # positions (rows of all_dists) of the smallest distances
    # Fixed: these are row positions, not index labels. df_train's index is not contiguous,
    # so `df_train.index.isin(most_sim_ids)` picked unrelated rows (and sometimes fewer than k).
    most_similar_df = df_train.iloc[most_sim_ids]
    y_true.append([int(bool(notice[c])) for c in cpvs])
    y_pred.append(most_similar_df[cpvs].any().astype(int).tolist())
# zero_division=0 keeps the previous numeric behaviour (undefined scores count as 0)
# without emitting an UndefinedMetricWarning.
print(classification_report(y_true, y_pred, target_names=[f"{cpv_labels[c][:40]}... | {c}" for c in cpvs],
                            zero_division=0))
| 75 0.02 0.17 0.04 77\n", + " Financial and insurance services... | 66 0.01 0.09 0.02 46\n", + " Postal and telecommunications services... | 64 0.01 0.05 0.01 37\n", + " Industrial machinery... | 42 0.01 0.19 0.03 27\n", + "Transport equipment and auxiliary produc... | 34 0.02 0.16 0.03 50\n", + "Transport services (excl. Waste transpor... | 60 0.03 0.17 0.05 41\n", + "Recreational, cultural and sporting serv... | 92 0.01 0.07 0.01 43\n", + "Furniture (incl. office furniture), furn... | 39 0.01 0.10 0.02 49\n", + "Electrical machinery, apparatus, equipme... | 31 0.01 0.11 0.02 36\n", + "Other community, social and personal ser... | 98 0.03 0.23 0.05 35\n", + " Installation services (except software)... | 51 0.00 0.00 0.00 8\n", + "Radio, television, communication, teleco... | 32 0.01 0.10 0.02 40\n", + " Public utilities... | 65 0.02 0.07 0.03 15\n", + "Agricultural, forestry, horticultural, a... | 77 0.01 0.05 0.01 22\n", + " Printed matter and related products... | 22 0.02 0.10 0.03 20\n", + "Supporting and auxiliary transport servi... | 63 0.02 0.15 0.03 34\n", + "Food, beverages, tobacco and related pro... | 15 0.00 0.00 0.00 5\n", + " Real estate services... | 70 0.00 0.00 0.00 14\n", + "Clothing, footwear, luggage articles and... | 18 0.00 0.00 0.00 5\n", + "Agricultural, farming, fishing, forestry... | 03 0.00 0.00 0.00 5\n", + " Chemical products... | 24 0.00 0.00 0.00 11\n", + "Machinery for mining, quarrying, constru... | 43 0.00 0.00 0.00 1\n", + "Leather and textile fabrics, plastic and... | 19 0.00 0.00 0.00 5\n", + " Collected and purified water... | 41 0.00 0.00 0.00 2\n", + "Musical instruments, sport goods, games,... | 37 0.00 0.00 0.00 3\n", + "Mining, basic metals and related product... | 14 0.00 0.00 0.00 6\n", + " Agricultural machinery... | 16 0.00 0.00 0.00 4\n", + "Services related to the oil and gas indu... 
from tqdm import tqdm
from sklearn.metrics import classification_report

# Nearest-neighbour CPV prediction: a test notice receives every CPV code carried by at
# least one of its `top_k` closest training notices (Jensen-Shannon distance between
# LDA topic distributions).
top_k = 10

y_true = []
y_pred = []
notices = [row for _, row in df_test.iterrows()]
for notice in tqdm(notices, total=len(notices)):
    notice_bow = dictionary.doc2bow(notice["title_texte"].split(" "))
    # NOTE(review): assumes every topic is returned (the model was built with
    # minimum_probability=0.0) so this vector lines up with the columns of all_dists.
    dist = np.array([tup[1] for tup in lda_model.get_document_topics(bow=notice_bow)])
    sims = get_doc_similarities(dist, all_dists)
    # A NaN distance comes from sqrt of a tiny negative divergence, i.e. a (near-)identical
    # document; treat it as distance 0 instead of letting argsort push it to the end.
    sims = np.nan_to_num(sims, nan=0.0)
    most_sim_ids = sims.argsort()[:top_k]  # positions (rows of all_dists) of the smallest distances
    # Fixed: these are row positions, not index labels. df_train's index is not contiguous,
    # so `df_train.index.isin(most_sim_ids)` picked unrelated rows (and sometimes fewer than k).
    most_similar_df = df_train.iloc[most_sim_ids]
    y_true.append([int(bool(notice[c])) for c in cpvs])
    y_pred.append(most_similar_df[cpvs].any().astype(int).tolist())
# zero_division=0 keeps the previous numeric behaviour (undefined scores count as 0)
# without emitting an UndefinedMetricWarning.
print(classification_report(y_true, y_pred, target_names=[f"{cpv_labels[c][:40]}... | {c}" for c in cpvs],
                            zero_division=0))
| 85 0.02 0.20 0.04 256\n", + "Construction structures and materials; a... | 44 0.00 0.05 0.01 103\n", + " Repair and maintenance services... | 50 0.03 0.23 0.05 297\n", + " Education and training services... | 80 0.04 0.33 0.07 403\n", + "Research and development services and re... | 73 0.08 0.52 0.14 1067\n", + " Construction work... | 45 0.07 0.49 0.12 731\n", + "Architectural, construction, engineering... | 71 0.14 0.73 0.23 1621\n", + "Business services: law, marketing, consu... | 79 0.23 0.91 0.37 2682\n", + "Sewage-, refuse-, cleaning-, and environ... | 90 0.06 0.46 0.11 629\n", + "Office and computing machinery, equipmen... | 30 0.02 0.20 0.04 266\n", + "Security, fire-fighting, police and defe... | 35 0.01 0.09 0.02 145\n", + "Medical equipments, pharmaceuticals and ... | 33 0.02 0.17 0.04 158\n", + "Hotel, restaurant and retail trade servi... | 55 0.00 0.03 0.01 117\n", + "IT services: consulting, software develo... | 72 0.08 0.51 0.13 914\n", + "Software package and information systems... | 48 0.02 0.13 0.03 199\n", + "Laboratory, optical and precision equipm... | 38 0.03 0.23 0.05 289\n", + "Petroleum products, fuel, electricity an... | 09 0.01 0.08 0.02 128\n", + "Administration, defence and social secur... | 75 0.02 0.15 0.03 277\n", + " Financial and insurance services... | 66 0.01 0.12 0.03 206\n", + " Postal and telecommunications services... | 64 0.01 0.10 0.02 148\n", + " Industrial machinery... | 42 0.02 0.14 0.03 159\n", + "Transport equipment and auxiliary produc... | 34 0.02 0.15 0.03 199\n", + "Transport services (excl. Waste transpor... | 60 0.01 0.11 0.02 122\n", + "Recreational, cultural and sporting serv... | 92 0.01 0.07 0.02 169\n", + "Furniture (incl. office furniture), furn... | 39 0.02 0.13 0.03 188\n", + "Electrical machinery, apparatus, equipme... | 31 0.01 0.09 0.02 139\n", + "Other community, social and personal ser... | 98 0.01 0.11 0.02 123\n", + " Installation services (except software)... 
| 51 0.00 0.00 0.00 50\n", + "Radio, television, communication, teleco... | 32 0.01 0.12 0.02 185\n", + " Public utilities... | 65 0.00 0.03 0.01 29\n", + "Agricultural, forestry, horticultural, a... | 77 0.01 0.06 0.01 83\n", + " Printed matter and related products... | 22 0.00 0.03 0.01 61\n", + "Supporting and auxiliary transport servi... | 63 0.01 0.06 0.01 144\n", + "Food, beverages, tobacco and related pro... | 15 0.00 0.00 0.00 43\n", + " Real estate services... | 70 0.01 0.05 0.01 44\n", + "Clothing, footwear, luggage articles and... | 18 0.00 0.03 0.01 35\n", + "Agricultural, farming, fishing, forestry... | 03 0.00 0.00 0.00 31\n", + " Chemical products... | 24 0.00 0.00 0.00 30\n", + "Machinery for mining, quarrying, constru... | 43 0.00 0.00 0.00 17\n", + "Leather and textile fabrics, plastic and... | 19 0.00 0.00 0.00 7\n", + " Collected and purified water... | 41 0.00 0.00 0.00 13\n", + "Musical instruments, sport goods, games,... | 37 0.00 0.00 0.00 13\n", + "Mining, basic metals and related product... | 14 0.00 0.00 0.00 16\n", + " Agricultural machinery... | 16 0.00 0.00 0.00 5\n", + "Services related to the oil and gas indu... 
| 76 0.00 0.00 0.00 5\n", + "\n", + " micro avg 0.07 0.48 0.12 12546\n", + " macro avg 0.02 0.15 0.04 12546\n", + " weighted avg 0.09 0.48 0.15 12546\n", + " samples avg 0.07 0.49 0.13 12546\n", + "\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "from sklearn.metrics import classification_report\n", + "\n", + "# NOTE: re-run this cell - the saved report was produced when neighbours were selected by index label.\n", + "TOP_K = 10  # number of nearest training notices whose CPV labels are pooled into the prediction\n", + "\n", + "y_true = []\n", + "y_pred = []\n", + "for _, notice in tqdm(df_train.iterrows(), total=len(df_train)):\n", + "    notice_bow = dictionary.doc2bow(notice[\"title_texte\"].split(\" \"))\n", + "    # NOTE(review): assumes the model returns a probability for every topic, so that `dist` lines up\n", + "    # with the columns of `all_dists` - confirm the model's minimum_probability setting.\n", + "    dist = np.array([tup[1] for tup in lda_model.get_document_topics(bow=notice_bow)])\n", + "    sims = get_doc_similarities(dist, all_dists)\n", + "    most_sim_pos = sims.argsort()[:TOP_K]  # positions of the TOP_K smallest Jensen Shannon distances\n", + "    # argsort yields positions, not index labels, so the rows must be taken with .iloc; filtering on\n", + "    # df_train.index.isin(...) picks unrelated rows whenever the index is not exactly 0..n-1.\n", + "    # NOTE(review): assumes row i of `all_dists` belongs to the i-th row of df_train - confirm.\n", + "    # NOTE(review): every training notice is presumably part of `all_dists` itself, so its own labels can\n", + "    # end up among its neighbours - confirm whether the query row should be excluded.\n", + "    most_similar_df = df_train.iloc[most_sim_pos]\n", + "    y_true.append([int(notice[c] == True) for c in cpvs])\n", + "    y_pred.append([int(most_similar_df[c].sum() > 0) for c in cpvs])\n", + "print(classification_report(y_true, y_pred, target_names=[f\"{cpv_labels[c][:40]}... 
| {c}\" for c in cpvs]))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Even on training set, performance remains low" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "More globally, it seems that increasing coherence of the model does not significantly increase CPV classification performance" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}