Code development platform for open source projects from the European Union institutions :large_blue_circle: EU Login authentication by SMS has been phased out. To see alternatives please check here

Skip to content
Snippets Groups Projects
Commit 84461f29 authored by Lionel Weicker's avatar Lionel Weicker
Browse files

AITED-163

parent 40d2456f
Branches
Tags
No related merge requests found
.venv/
\ No newline at end of file
"""Train a spaCy v2 text-categorizer that flags whether a budgetary value
belongs to a contract ("Contract" label, non-exclusive), then save the best
pipeline (by test-set textcat F-score) to disk."""
import json
import random
import sys
from pathlib import Path

import numpy
import spacy
import tqdm
from spacy.util import compounding, minibatch

# spaCy v3 changed both the training API and the training-data format;
# this script only works with the v2 line.
if not spacy.__version__.startswith("2."):
    print(
        "spacy version must be 2.X.X for the training. If you want to use a "
        "recent version of spacy, you first need to convert the dataset "
        "accordingly."
    )
    sys.exit(1)

# --- Configuration ----------------------------------------------------------
OUTPUT = "./is_contract_budget"  # directory the trained pipeline is written to
spacy_model = "en_core_web_lg"   # base model; "blank:xx" would start untrained
dropout = 0.2
seed = 0
n_iter = 10

# Seed both RNGs so the shuffle/split and training are reproducible.
random.seed(seed)
numpy.random.seed(seed)

# --- Pipeline setup ---------------------------------------------------------
if spacy_model.startswith("blank:"):
    nlp = spacy.blank(spacy_model.replace("blank:", ""))
else:
    nlp = spacy.load(spacy_model)
nlp.add_pipe(nlp.create_pipe("textcat", config={"exclusive_classes": False}), last=True)
pipe = nlp.get_pipe("textcat")
pipe.add_label("Contract")

# Train only the textcat component; every other pipe stays frozen.
disabled = nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"])
optimizer = nlp.begin_training(component_cfg={'textcat': {'exclusive_classes': False}})
batch_size = compounding(1.0, 16.0, 1.001)

best_scores = None
best_model = None

# Dataset: spaCy v2 training format, a list of (text, annotations) pairs.
# fix: explicit encoding — the default is platform-dependent for JSON text.
with open(
    "./20230525-annotations_of_extracted_budgetary_values_with_context.json",
    "r",
    encoding="utf-8",
) as file:
    full_dataset = json.load(file)

# 50/50 shuffled train/test split (deterministic thanks to the seed above).
eval_split = 0.5
random.shuffle(full_dataset)
test_dataset = full_dataset[:int(len(full_dataset) * eval_split)]
train_dataset = full_dataset[len(test_dataset):]

# Score the untrained categorizer so training progress has a reference point.
baseline = nlp.evaluate(test_dataset)
print(f"Baseline F-Score: {baseline.textcat_score}")
print("iter \t loss \t F1")  # fix: plain string — f-string had no placeholders

for i in range(n_iter):
    random.shuffle(train_dataset)
    losses = {}
    data = tqdm.tqdm(train_dataset, leave=False, desc=f"{i + 1}")
    for batch in minibatch(data, size=batch_size):
        docs, annots = zip(*batch)
        nlp.update(docs, annots, drop=dropout, losses=losses)
    # Evaluate with the averaged parameters — stabler than raw last-step weights.
    with nlp.use_params(optimizer.averages):
        scores = nlp.evaluate(test_dataset)
    # fix: report 1-based epochs, consistent with the tqdm desc above
    print(f"{i + 1} \t {losses['textcat']} \t {scores.textcat_score}")
    # Keep a serialized copy of the best-scoring textcat weights.
    if not best_scores or scores.textcat_score > best_scores.textcat_score:
        best_scores = scores
        best_model = pipe.to_bytes(exclude=["vocab"])

print(f"Best F1 score: {best_scores.textcat_score}")

# Restore the best textcat weights and the disabled pipes, then persist
# the full pipeline (not just the categorizer) to OUTPUT.
output_path = Path(OUTPUT)
pipe.from_bytes(best_model)
if disabled:
    disabled.restore()
nlp.to_disk(output_path)
This diff is collapsed.
This diff is collapsed.
# budgetary-value-extractor
Datasets are available in s3://d-ew1-ted-ai-experiments-data/budgetary-values/
Python 3.9
Install requirements from `requirements.txt` file
matplotlib==3.7.1
pandas==2.0.2
spacy==2.3.9
tika==2.6.0
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment