Code development platform for open source projects from the European Union institutions :large_blue_circle: EU Login authentication by SMS has been phased out. To see alternatives please check here

Skip to content
Snippets Groups Projects
Commit 84461f29 authored by Lionel Weicker's avatar Lionel Weicker
Browse files

AITED-163

parent 40d2456f
Branches
Tags
No related merge requests found
.venv/
\ No newline at end of file
"""Train a spaCy v2 text-categorizer that flags whether a budgetary value
belongs to a contract ("Contract" label, non-exclusive), then save the best
pipeline (by test-set textcat F-score) to disk."""
import json
import random
import sys
from pathlib import Path

import numpy
import spacy
import tqdm
from spacy.util import compounding, minibatch

# spaCy v3 changed both the training API and the training-data format;
# this script only works with the v2 line.
if not spacy.__version__.startswith("2."):
    print(
        "spacy version must be 2.X.X for the training. If you want to use a "
        "recent version of spacy, you first need to convert the dataset "
        "accordingly."
    )
    sys.exit(1)

# --- Configuration ----------------------------------------------------------
OUTPUT = "./is_contract_budget"  # directory the trained pipeline is written to
spacy_model = "en_core_web_lg"   # base model; "blank:xx" would start untrained
dropout = 0.2
seed = 0
n_iter = 10

# Seed both RNGs so the shuffle/split and training are reproducible.
random.seed(seed)
numpy.random.seed(seed)

# --- Pipeline setup ---------------------------------------------------------
if spacy_model.startswith("blank:"):
    nlp = spacy.blank(spacy_model.replace("blank:", ""))
else:
    nlp = spacy.load(spacy_model)
nlp.add_pipe(nlp.create_pipe("textcat", config={"exclusive_classes": False}), last=True)
pipe = nlp.get_pipe("textcat")
pipe.add_label("Contract")

# Train only the textcat component; every other pipe stays frozen.
disabled = nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"])
optimizer = nlp.begin_training(component_cfg={'textcat': {'exclusive_classes': False}})
batch_size = compounding(1.0, 16.0, 1.001)

best_scores = None
best_model = None

# Dataset: spaCy v2 training format, a list of (text, annotations) pairs.
# fix: explicit encoding — the default is platform-dependent for JSON text.
with open(
    "./20230525-annotations_of_extracted_budgetary_values_with_context.json",
    "r",
    encoding="utf-8",
) as file:
    full_dataset = json.load(file)

# 50/50 shuffled train/test split (deterministic thanks to the seed above).
eval_split = 0.5
random.shuffle(full_dataset)
test_dataset = full_dataset[:int(len(full_dataset) * eval_split)]
train_dataset = full_dataset[len(test_dataset):]

# Score the untrained categorizer so training progress has a reference point.
baseline = nlp.evaluate(test_dataset)
print(f"Baseline F-Score: {baseline.textcat_score}")
print("iter \t loss \t F1")  # fix: plain string — f-string had no placeholders

for i in range(n_iter):
    random.shuffle(train_dataset)
    losses = {}
    data = tqdm.tqdm(train_dataset, leave=False, desc=f"{i + 1}")
    for batch in minibatch(data, size=batch_size):
        docs, annots = zip(*batch)
        nlp.update(docs, annots, drop=dropout, losses=losses)
    # Evaluate with the averaged parameters — stabler than raw last-step weights.
    with nlp.use_params(optimizer.averages):
        scores = nlp.evaluate(test_dataset)
    # fix: report 1-based epochs, consistent with the tqdm desc above
    print(f"{i + 1} \t {losses['textcat']} \t {scores.textcat_score}")
    # Keep a serialized copy of the best-scoring textcat weights.
    if not best_scores or scores.textcat_score > best_scores.textcat_score:
        best_scores = scores
        best_model = pipe.to_bytes(exclude=["vocab"])

print(f"Best F1 score: {best_scores.textcat_score}")

# Restore the best textcat weights and the disabled pipes, then persist
# the full pipeline (not just the categorizer) to OUTPUT.
output_path = Path(OUTPUT)
pipe.from_bytes(best_model)
if disabled:
    disabled.restore()
nlp.to_disk(output_path)
This diff is collapsed.
This diff is collapsed.
# budgetary-value-extractor
Datasets are available in s3://d-ew1-ted-ai-experiments-data/budgetary-values/
Python 3.9
Install requirements from `requirements.txt` file
matplotlib==3.7.1
pandas==2.0.2
spacy==2.3.9
tika==2.6.0
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment