diff --git a/.sagify.json b/.sagify.json
index 2d4be0ea75a44c8f0632fa524f646c3a9da96f4f..5a539159354e37aabff667926e86a33291ee67f3 100644
--- a/.sagify.json
+++ b/.sagify.json
@@ -5,6 +5,6 @@
     "python_version": "3.8",
     "requirements_dir": "requirements.txt",
     "sagify_module_dir": "src",
-    "experiment_id": "cpv_v0.0.1",
+    "experiment_id": "cpv_v0.0.2",
     "docker_image_base_url": "python/3.8"
 }
diff --git a/ci/train.sh b/ci/train.sh
index 7d884e5dcd9ccb0007a09e80a2f1fe055eb52d39..df3294c1b6669fb6deef4f9144c738aeb1adbbc2 100644
--- a/ci/train.sh
+++ b/ci/train.sh
@@ -33,7 +33,7 @@ job_arn=$(
     --role-arn "arn:aws:iam::528719223857:role/sagemaker_notebooks" \
     --input-data-config '{"ChannelName":"training","DataSource":{"S3DataSource":{"S3DataType":"S3Prefix","S3Uri":"'"$s3_training_data_prefix"'","S3DataDistributionType":"FullyReplicated"}}}' \
     --output-data-config "S3OutputPath=$s3_output_prefix" \
-    --resource-config "InstanceType=ml.m5.large,InstanceCount=1,VolumeSizeInGB=30" \
+    --resource-config "InstanceType=ml.m5.4xlarge,InstanceCount=1,VolumeSizeInGB=30" \
     --stopping-condition "MaxRuntimeInSeconds=86400" \
     --query TrainingJobArn \
     --region eu-west-1 \
diff --git a/src/sagify_base/local_test/test_dir/input/config/hyperparameters.json b/src/sagify_base/local_test/test_dir/input/config/hyperparameters.json
index c1ebf16a9c76c4248a75851db835c4741220bf3a..cbd2c5cb1a041dc111fc50e6ab134b300523aed9 100644
--- a/src/sagify_base/local_test/test_dir/input/config/hyperparameters.json
+++ b/src/sagify_base/local_test/test_dir/input/config/hyperparameters.json
@@ -1,3 +1,5 @@
 {
-    "c_param": "15.25"
+    "c_param": "7.9165",
+    "min_df": "1",
+    "max_df": "0.2063"
 }
diff --git a/src/sagify_base/prediction/prediction.py b/src/sagify_base/prediction/prediction.py
index efc2e332e29fd8d9c577d16b5a94a14d030691fa..0f927c7eca4f201ff81212629a6f04825964b268 100644
--- a/src/sagify_base/prediction/prediction.py
+++ b/src/sagify_base/prediction/prediction.py
@@ -8,10 +8,9 @@ import re
 
 _MODEL_PATH = os.path.join('/opt/ml/', 'model')  # Path where all your model(s) live in
 
-
-ALL_CPVS = ['85', '44', '50', '80', '73', '45', '71', '79', '90', '30', '35', '33', '55', '72', '48', '38', '09',
-            '75', '66', '64', '42', '34', '60', '92', '39', '31', '98', '51', '32', '65', '77', '22', '63', '15',
-            '70', '18', '03', '24', '43', '19', '41', '37', '14', '16', '76']
+ALL_CPVS = ['85', '44', '50', '80', '73', '45', '71', '79', '90', '30', '35', '33', '55', '72', '48', '38', '09', '75',
+            '66', '64', '42', '34', '60', '92', '39', '31', '98', '51', '32', '65', '77', '22', '63', '15', '70', '18',
+            '03', '24', '43', '19', '41', '37', '14', '16', '76']
 
 CPV_MAPPING = {
     "03": "Agricultural, farming, fishing, forestry and related products",
@@ -74,8 +73,6 @@ MONTHS = [" january ", " february ", " march ", " april ", " may ", " june ", "
           " jan ", " feb ", " mar ", " apr ", " jun ", " jul ", " aug ", " sep ", " oct ", " nov ", " dec "]
 
-
-
 def _remove_multiple_spaces(text: str) -> str:
     return re.sub('\s+', ' ', text)
 
 
diff --git a/src/sagify_base/training/training.py b/src/sagify_base/training/training.py
index b679b3ebbbcccb526d7cfab7a9fec62f37759822..0822eb4de0c9f3bbdd11cf4ac55695359043724a 100644
--- a/src/sagify_base/training/training.py
+++ b/src/sagify_base/training/training.py
@@ -63,25 +63,33 @@ def train(input_data_path: str, model_save_path: str, hyperparams_path: str = No
         raise ValueError(f"No file found in {input_data_folder}")
     raw_data = [pd.read_csv(file, index_col=0, engine="python") for file in input_files]
     df = pd.concat(raw_data)
-    print(f"Full dataset contains {len(df.index)} records.")
+    print(f"Full dataset contains {len(df)} records.")
 
     train_df, test_df = train_test_split(df, test_size=0.1)
-    x_train = train_df['title_texte']
+    x_train = train_df['title_texte'].values.astype(str)
     y_train = train_df.drop(['title_texte'], axis=1)
-    print(f"Training set contains {len(train_df.index)} records.")
+    print(f"Training set contains {len(train_df)} records.")
 
-    x_test = test_df['title_texte']
+    x_test = test_df['title_texte'].values.astype(str)
     y_test = test_df.drop(['title_texte'], axis=1)
-    print(f"Test set contains {len(x_test.index)} records.")
+    print(f"Test set contains {len(x_test)} records.")
 
     # Train model
     if "c_param" in hyperparameters.keys():
         c_param = float(hyperparameters.get("c_param"))
     else:
         raise Exception(f"c_param missing from hyper parameters config")
+    if "min_df" in hyperparameters.keys():
+        min_df = int(hyperparameters.get("min_df"))
+    else:
+        raise Exception(f"min_df missing from hyper parameters config")
+    if "max_df" in hyperparameters.keys():
+        max_df = float(hyperparameters.get("max_df"))
+    else:
+        raise Exception(f"max_df missing from hyper parameters config")
 
     svc_pipeline = Pipeline([
-        ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
+        ('tfidf', TfidfVectorizer(ngram_range=(1, 3), min_df=min_df, max_df=max_df)),
         ('clf', OneVsRestClassifier(LinearSVC(max_iter=10000, C=c_param, random_state=736283))),
     ])
     svc_pipeline.fit(x_train, y_train)