
Verified Commit 5aaf7e05 authored by Enrico UBALDI's avatar Enrico UBALDI
Browse files

Hotfix duplicated names.

Also fixing the StrEnum handling for python <=3.10.
parent dcbd6b7f
No related branches found
No related tags found
1 merge request: !2 Hotfix duplicated names.
Pipeline #242500 failed
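The Python <=3.10 fix mentioned in the commit message is commonly handled with a conditional import of the `strenum` backport (added to pyproject.toml in this commit as `strenum = "^0.4.15"`). A sketch of that pattern; the actual dsa_tdb code may differ, and the fallback class here stands in for the backport so the snippet is self-contained:

```python
try:
    # Built-in since Python 3.11
    from enum import StrEnum
except ImportError:
    # Python <= 3.10: the project depends on the `strenum` backport.
    # A minimal stand-in with equivalent behaviour:
    from enum import Enum

    class StrEnum(str, Enum):
        def __str__(self) -> str:
            return str(self.value)


class TDB_dailyDumpsVersion(StrEnum):
    # Hypothetical members mirroring the values used in the tests below
    full = "full"
    light = "light"
```

Because `StrEnum` members compare equal to plain strings, code such as `T.TDB_dailyDumpsVersion.light == 'light'` in the tests works identically on both interpreter versions.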
@@ -21,6 +21,13 @@ DSA Transparency database tools
A set of tools to work with daily or total dumps coming from the [DSA Transparency Database](https://transparency.dsa.ec.europa.eu/).
> **_WARNING_** Due to some backend issues, there are currently duplicated platform names in the database (no VLOPs have been affected).
> This is being addressed, but in the meantime:
> - We have temporarily loosened the validation to allow duplicated platform names: the code raises a warning instead of an error.
> - You are not affected if:
>   - You are working with the `global` dataset, as it does not include platform names.
>   - You are working with platform-specific datasets that are not duplicated.
> Please visit the DSA Transparency Database [download page](https://transparency.dsa.ec.europa.eu/data-download) and check the platform selection dropdown menu to verify whether your platform is affected.
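The loosened validation described above could be sketched as follows (a hypothetical helper, not the actual dsa_tdb implementation): duplicated platform names now emit a warning where they previously raised an error.

```python
import warnings
from collections import Counter


def check_platform_names(names):
    """Warn (rather than raise) when platform names are duplicated.

    Hypothetical sketch of the temporarily loosened check; the real
    dsa_tdb validation code may differ.
    """
    duplicated = [name for name, count in Counter(names).items() if count > 1]
    if duplicated:
        # Previously this condition would have raised a ValueError.
        warnings.warn(f"Duplicated platform names: {sorted(duplicated)}")
    return names
```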
## Requirements
@@ -17,13 +17,13 @@
#
# If not, see < https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12 >.#
__license__ = "EUPLv1.2"
__version__ = '0.3.7'
__version__ = "0.3.8"
__maintainer__ = "Enrico Ubaldi"
__email__ = "enrico.ubaldi@ec.europa.eu"
__homepage__ = "https://code.europa.eu/dsa/transparency-database/dsa-tdb"
__status__ = "Development"
__doc__ = '''
__doc__ = """
The `dsa_tdb` module documentation.
The `dsa_tdb` module provides a set of tools to interact with the DSA Transparency Database (TDB) data.
@@ -31,16 +31,18 @@ It provides a set of classes and functions to fetch, extract, transform, filter
It internally uses `pyspark` to handle the data at scale even on regular computers and can be easily introduced
into pipelines using `pandas` or other data manipulation libraries.
'''
"""
import logging
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
datefmt='%Y-%m-%d,%H:%M:%S')
logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s", datefmt="%Y-%m-%d,%H:%M:%S")
logging.getLogger(__name__).addHandler(logging.NullHandler())
import dsa_tdb.cli as cli
import dsa_tdb.types as types
import dsa_tdb.etl as etl
import dsa_tdb.fetch as fetch
import dsa_tdb.utils as utils
from dsa_tdb.core import TDB_DataFrame
import dsa_tdb.cli as cli # noqa: E402
import dsa_tdb.etl as etl # noqa: E402
import dsa_tdb.fetch as fetch # noqa: E402
import dsa_tdb.types as types # noqa: E402
import dsa_tdb.utils as utils # noqa: E402
from dsa_tdb.core import TDB_DataFrame as TDB_DataFrame # noqa: E402
__all__ = ["cli", "etl", "fetch", "types", "utils", "TDB_DataFrame"]
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
[[package]]
name = "alabaster"
@@ -4171,6 +4171,22 @@ anyio = ">=3.4.0,<5"
[package.extras]
full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"]
[[package]]
name = "strenum"
version = "0.4.15"
description = "An Enum that inherits from str."
optional = false
python-versions = "*"
files = [
{file = "StrEnum-0.4.15-py3-none-any.whl", hash = "sha256:a30cda4af7cc6b5bf52c8055bc4bf4b2b6b14a93b574626da33df53cf7740659"},
{file = "StrEnum-0.4.15.tar.gz", hash = "sha256:878fb5ab705442070e4dd1929bb5e2249511c0bcf2b0eeacf3bcd80875c82eff"},
]
[package.extras]
docs = ["myst-parser[linkify]", "sphinx", "sphinx-rtd-theme"]
release = ["twine"]
test = ["pylint", "pytest", "pytest-black", "pytest-cov", "pytest-pylint"]
[[package]]
name = "tabulate"
version = "0.9.0"
@@ -4545,4 +4561,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.10.0,<3.13.0"
content-hash = "a0026ffb58f9a91df42933d8e4c69a8fe13698df433979e78912bddec3adc095"
content-hash = "7ebd78091fb4a2bde82e51e3a59355bd099af99fd0dcdb986ed4427251b78e8b"
[tool.poetry]
name = "dsa-tdb"
version = "0.3.7"
version = "0.3.8"
description = "The tools and code to manage the daily dumps of the TDB."
authors = ["DSA CNECT F2"]
license = "EUPLv1.2"
@@ -20,6 +20,7 @@ numpy = "1.26.1"
pyspark = "^3.5.3"
pyyaml = "^6.0.2"
psutil = "^6.0.0"
strenum = "^0.4.15"
[tool.poetry.scripts]
dsa-tdb-cli = 'dsa_tdb.cli:main'
@@ -19,7 +19,7 @@
## If not, see < https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12 >.##
set -e
# Usage: bash ./scripts/init_routine.sh --platform global --version full --numprocs 12 --root_dir data/daily_dumps_test --start 2023-09-01 --end 2023-09-30
# Usage: bash ./scripts/init_routine.sh --platform global --version full --numprocs 12 --root_dir data/tdb_data --start 2023-09-01 --end 2023-09-30
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case "$1" in
@@ -62,23 +62,24 @@ done
platform=${platform:-global}
version=${version:-full}
numprocs=${numprocs:-$(($(nproc) < 12 ? $(nproc) : 12))}
root_dir=${root_dir:-data/daily_dumps_test}
root_dir=${root_dir:-data/tdb_data}
start=${start:-2023-09-01}
end=${end:-$(date +%Y-%m-%d)}
# whether or not to delete the files
delete_chunks=${delete_chunks:-false}
# ===
platform=$(python -c "import dsa_tdb as t; print(t.utils.sanitize_platform_name('${platform}'))")
basedir="${root_dir}/${platform}___${version}"
# derived from data directory structure
outFile="${basedir}/daily_dumps_chunked/"
logFile="${basedir}/log"
# Ensure log folder exists
mkdir -p $(dirname $logFile)
# Concat the string to make the output file name
outFileAgg="${basedir}/aggregated-${platform}-${version}-partitioned.{}"
outFileAgg="$basedir/aggregated-$platform-$version.{}"
# Create a log file to append everything to it
echo "[$(date +'%Y-%m-%dT%H:%M:%S%:z')] STARTING IMPORT PROCESS" | tee -a "${logFile}"
@@ -109,14 +110,14 @@ while (( $(date -d "${start}" +%s) <= $(date -d "${end}" +%s) )); do
-o $outFileAgg \
-i "${start}" \
-f "${current_end}" \
-c aggregation_config.yaml \
-c config_aggregation.yaml \
-n 4 2>&1 | tee -a "${logFile}"
toc=$(date +%s)
echo "[$(date +'%Y-%m-%dT%H:%M:%S%:z')] Aggregation took $((toc - tic)) seconds" | tee -a "${logFile}"
if $delete_chunks; then
# Delete chunk files (keep folder and COMPLETE file to avoid downloading the files once again at next routine run)
rm -r ${outFile}/*/*.parquet
rm -r ${basedir}/daily_dumps_chunked/*/*.parquet
fi
start=$(date -d "${current_end} +1day" +%Y-%m-%d)
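The script's loop walks the `[start, end]` range in windows, aggregating each window and then advancing `start` to the day after `current_end`. The same windowing logic can be sketched in Python (the window size is an assumption for illustration; the script's actual computation of `current_end` is not shown in this hunk):

```python
from datetime import date, timedelta


def date_windows(start, end, days=7):
    """Yield (window_start, window_end) pairs covering [start, end].

    Mirrors the shell loop above: each window is at most `days` long,
    and the next window starts the day after the previous one ended.
    The 7-day default is a hypothetical choice, not taken from the repo.
    """
    cur = start
    while cur <= end:
        window_end = min(cur + timedelta(days=days - 1), end)
        yield cur, window_end
        cur = window_end + timedelta(days=1)
```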
@@ -16,14 +16,10 @@
# along with this program.
#
# If not, see < https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12 >.#
from ast import parse
import pytest
import pandas as pd
from dsa_tdb.etl import loadFile
from dsa_tdb.types import TDB_datetimeColumns, TDB_columnsFull, datetime_format
from dsa_tdb.types import TDB_columnsFull, TDB_datetimeColumns, datetime_format
from pyspark.sql import SparkSession
def test_loadFile():
# Test case 1: Loading a file with corrupted dates raises an error
@@ -35,11 +31,13 @@ def test_loadFile():
# TDB_datetimeColumns.content_date])
# Test case 2: Loading a file should load correct dates
tmp_dtype = {k: 'str' for k in TDB_columnsFull._member_names_
if k not in TDB_datetimeColumns._member_names_}
assert pd.read_csv('tests/test_data/test_sor_date_parse.csv',
tmp_dtype = {k: "str" for k in TDB_columnsFull._member_names_ if k not in TDB_datetimeColumns._member_names_}
assert (
pd.read_csv(
"tests/test_data/test_sor_date_parse.csv",
parse_dates=TDB_datetimeColumns._member_names_,
dtype=tmp_dtype,
date_format=datetime_format,
)[TDB_datetimeColumns.content_date].dt.day.iloc[0] > 0
)[TDB_datetimeColumns.content_date].dt.day.iloc[0]
> 0
)
# This file is part of dsa_tdb (see https://code.europa.eu/dsa/transparency-database/dsa-tdb).
#
# SPDX-License-Identifier: EUPLv1.2
# Copyright (C) 2024 European Union
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the EUROPEAN UNION PUBLIC LICENCE v. 1.2 as
# published by the European Union.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# EUROPEAN UNION PUBLIC LICENCE v. 1.2 for further details.
#
# You should have received a copy of the EUROPEAN UNION PUBLIC LICENCE v. 1.2.
# along with this program.
#
# If not, see < https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12 >.#
import pandas as pd
import dsa_tdb.types as T
def test_loadFile():
# Test case 1: Testing that all the str enums work as expected
assert T.ALL_PLATFORMS_ENTRY_VALUE == 'All Platforms'
assert T.TDB_dailyDumpsVersion.light == 'light'
assert 'parquet' in T.TDB_chunkFormat._member_names_
assert 'uuid' in list(T.TDB_columnsFull._member_names_)
assert 'created_at' in [T.TDB_datetimeColumns[c]
for c in T.TDB_datetimeColumns._member_names_]
@@ -17,41 +17,43 @@
#
# If not, see < https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12 >.#
import pytest
from dsa_tdb.utils import vals2arra, territories2label
from dsa_tdb.utils import territories2label, vals2arra
def test_vals2arra():
# Test case 1: Empty string
assert vals2arra('') == []
assert vals2arra("") == []
# Test case 2: Single value
assert vals2arra('AAA') == ['AAA']
assert vals2arra("AAA") == ["AAA"]
# Test case 3: Multiple values
assert vals2arra('[AAA,BBB]') == ['AAA', 'BBB']
assert vals2arra("[AAA,BBB]") == ["AAA", "BBB"]
# Test case 4: Multiple values with spaces and double quotes
assert vals2arra('["AAA","BBB"]') == ['AAA', 'BBB']
assert vals2arra('["AAA","BBB"]') == ["AAA", "BBB"]
# Test case 5: Input is not a string
assert vals2arra(None) == []
# Test case 6: Input is '<NA>'
assert vals2arra('<NA>') == []
assert vals2arra("<NA>") == []
# Test case 7: Input has missing closing bracket
with pytest.raises(ValueError):
vals2arra('[AAA,BBB')
vals2arra("[AAA,BBB")
# Test case 8: as_set=True
assert vals2arra('[AAA,BBB]', as_set=True) == {'AAA', 'BBB'}
assert vals2arra("[AAA,BBB]", as_set=True) == {"AAA", "BBB"}
def test_territories2label():
# Test case 1: Single value
assert territories2label('[IT,ES]') == 'ES_IT'
assert territories2label("[IT,ES]") == "ES_IT"
# Test case 2: Multiple values
assert territories2label('[IT,ES,FR]') == 'ES_FR_IT'
assert territories2label("[IT,ES,FR]") == "ES_FR_IT"
# Test case 3: Input is not a string
assert territories2label('') == ''
\ No newline at end of file
assert territories2label("") == ""