Code development platform for open source projects from the European Union institutions

Skip to content
Snippets Groups Projects
Commit aa0e15dd authored by Lionel Weicker
Browse files

Merge branch 'full_scope_en_notices' into 'main'

all notices in EN

See merge request !5
parents 6bc06d9a 104432b7
Branches main
1 merge request: !5 all notices in EN
Pipeline #111933 passed
......@@ -5,6 +5,6 @@
"python_version": "3.8",
"requirements_dir": "requirements.txt",
"sagify_module_dir": "src",
"experiment_id": "cpv_v0.0.1",
"experiment_id": "cpv_v0.0.2",
"docker_image_base_url": "python/3.8"
}
......@@ -33,7 +33,7 @@ job_arn=$(
--role-arn "arn:aws:iam::528719223857:role/sagemaker_notebooks" \
--input-data-config '{"ChannelName":"training","DataSource":{"S3DataSource":{"S3DataType":"S3Prefix","S3Uri":"'"$s3_training_data_prefix"'","S3DataDistributionType":"FullyReplicated"}}}' \
--output-data-config "S3OutputPath=$s3_output_prefix" \
--resource-config "InstanceType=ml.m5.large,InstanceCount=1,VolumeSizeInGB=30" \
--resource-config "InstanceType=ml.m5.4xlarge,InstanceCount=1,VolumeSizeInGB=30" \
--stopping-condition "MaxRuntimeInSeconds=86400" \
--query TrainingJobArn \
--region eu-west-1 \
......
{
"c_param": "15.25"
"c_param": "7.9165",
"min_df": "1",
"max_df": "0.2063"
}
......@@ -8,10 +8,9 @@ import re
_MODEL_PATH = os.path.join('/opt/ml/', 'model') # Path where all your model(s) live in
ALL_CPVS = ['85', '44', '50', '80', '73', '45', '71', '79', '90', '30', '35', '33', '55', '72', '48', '38', '09',
'75', '66', '64', '42', '34', '60', '92', '39', '31', '98', '51', '32', '65', '77', '22', '63', '15',
'70', '18', '03', '24', '43', '19', '41', '37', '14', '16', '76']
ALL_CPVS = ['85', '44', '50', '80', '73', '45', '71', '79', '90', '30', '35', '33', '55', '72', '48', '38', '09', '75',
'66', '64', '42', '34', '60', '92', '39', '31', '98', '51', '32', '65', '77', '22', '63', '15', '70', '18',
'03', '24', '43', '19', '41', '37', '14', '16', '76']
CPV_MAPPING = {
"03": "Agricultural, farming, fishing, forestry and related products",
......@@ -74,8 +73,6 @@ MONTHS = [" january ", " february ", " march ", " april ", " may ", " june ", "
" jan ", " feb ", " mar ", " apr ", " jun ", " jul ", " aug ", " sep ", " oct ", " nov ", " dec "]
def _remove_multiple_spaces(text: str) -> str:
return re.sub('\s+', ' ', text)
......
......@@ -63,25 +63,33 @@ def train(input_data_path: str, model_save_path: str, hyperparams_path: str = No
raise ValueError(f"No file found in {input_data_folder}")
raw_data = [pd.read_csv(file, index_col=0, engine="python") for file in input_files]
df = pd.concat(raw_data)
print(f"Full dataset contains {len(df.index)} records.")
print(f"Full dataset contains {len(df)} records.")
train_df, test_df = train_test_split(df, test_size=0.1)
x_train = train_df['title_texte']
x_train = train_df['title_texte'].values.astype(str)
y_train = train_df.drop(['title_texte'], axis=1)
print(f"Training set contains {len(train_df.index)} records.")
print(f"Training set contains {len(train_df)} records.")
x_test = test_df['title_texte']
x_test = test_df['title_texte'].values.astype(str)
y_test = test_df.drop(['title_texte'], axis=1)
print(f"Test set contains {len(x_test.index)} records.")
print(f"Test set contains {len(x_test)} records.")
# Train model
if "c_param" in hyperparameters.keys():
c_param = float(hyperparameters.get("c_param"))
else:
raise Exception(f"c_param missing from hyper parameters config")
if "min_df" in hyperparameters.keys():
min_df = int(hyperparameters.get("min_df"))
else:
raise Exception(f"min_df missing from hyper parameters config")
if "max_df" in hyperparameters.keys():
max_df = float(hyperparameters.get("max_df"))
else:
raise Exception(f"max_df missing from hyper parameters config")
svc_pipeline = Pipeline([
('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
('tfidf', TfidfVectorizer(ngram_range=(1, 3), min_df=min_df, max_df=max_df)),
('clf', OneVsRestClassifier(LinearSVC(max_iter=10000, C=c_param, random_state=736283))),
])
svc_pipeline.fit(x_train, y_train)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment