Code development platform for open source projects from the European Union institutions

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • ai4xml/playground
1 result
Show changes
Commits on Source (4)
......@@ -9,3 +9,6 @@ analyze_akn_datasets/__pycache__
Conval_API/ids.csv
prompt_engineering_experiments/evaluation_results/
prompt_engineering_experiments/data/*
dspy_programs/data
.DS_Store
%% Cell type:code id: tags:
``` python
!pip install tqdm
!pip install lxml
```
%% Output
Requirement already satisfied: tqdm in /home/nasredine/dev/work/ai4xml/playground/myenv/lib/python3.11/site-packages (4.66.2)
Requirement already satisfied: lxml in /home/nasredine/dev/work/ai4xml/playground/myenv/lib/python3.11/site-packages (5.2.2)
%% Cell type:code id: tags:
``` python
import os
from functions import *
```
%% Cell type:code id: tags:
``` python
# Resolve all data/result paths relative to the notebook's working directory.
cwd = os.getcwd()
documents_dir = os.path.join(f'{cwd}/data/genai4lex', 'Documents')
results_dir = os.path.join(cwd, 'results/genai4lex')
```
%% Cell type:markdown id: tags:
### download AKN documents from genai4lex repo
%% Cell type:code id: tags:
``` python
# Create results_dir (and any missing parents) if it does not exist.
# exist_ok=True is idempotent and avoids the check-then-create race of
# the previous isdir()/makedirs() pair.
os.makedirs(results_dir, exist_ok=True)
```
%% Cell type:code id: tags:
``` python
# Check if the 'Documents' folder exists, if not, download and extract the zip file
if not os.path.isdir(documents_dir):
    zip_url = "https://gitlab.com/CIRSFID/genai4lex/-/raw/main/LegalResources/Eur-Lex/2010-2021/Documents.zip?inline=false"
    download_and_extract_zip(cwd, zip_url) # Ensure this function is defined to handle download and extraction
# Re-check after the download attempt; abort the notebook run if the
# expected folder still did not appear.
if not os.path.isdir(documents_dir):
    print("Invalid directory path.")
    exit()
```
%% Cell type:markdown id: tags:
### download AKN schema
%% Cell type:code id: tags:
``` python
# Fetch the Akoma Ntoso 3.0 XSD once; the download is skipped when the
# file is already on disk.
schema_dir = os.path.join(cwd, 'schema')
os.makedirs(schema_dir, exist_ok=True)
# Download Akoma Ntoso Schema
schema_url = "https://docs.oasis-open.org/legaldocml/akn-core/v1.0/os/part2-specs/schemas/akomantoso30.xsd"
schema_path = os.path.join(schema_dir, 'akomantoso30.xsd')
if not os.path.exists(schema_path):
    download_schema(schema_url, schema_path)
```
%% Cell type:markdown id: tags:
### Analysis and statistics
%% Cell type:code id: tags:
``` python
# Assuming analyze_xml_files and the associated functions are defined elsewhere
results, stats = analyze_xml_files(documents_dir,schema_path)
```
%% Output
Analyzing XML files: 100%|██████████| 15283/15283 [00:22<00:00, 674.22it/s]
%% Cell type:code id: tags:
``` python
# Persist per-document analysis results and the aggregate statistics as CSV.
output_csv_path = os.path.join(results_dir, 'results.csv')
# Write results to CSV
write_results_to_csv(results, output_csv_path) # Ensure this function is defined
stats_file_path = os.path.join(results_dir, 'statistics.csv')
write_stats_to_file(stats, stats_file_path) # Ensure this function is defined
```
%% Cell type:code id: tags:
``` python
print(stats)
```
%% Output
{'Average Total Pages': 4.1452594385919, 'Missing Total Pages': 0, 'Missing OJ Number': 0, 'Missing Publication Date': 0, 'Earliest Publication Date': '2010-01-05', 'Latest Publication Date': '2021-08-09'}
%% Cell type:markdown id: tags:
## Schema validation
%% Cell type:markdown id: tags:
### validate documents
%% Cell type:code id: tags:
``` python
results, valid, invalid = validate_xml_files(documents_dir, schema_path)
```
%% Output
Validating XML files: 100%|██████████| 15283/15283 [00:16<00:00, 934.79file/s]
%% Cell type:code id: tags:
``` python
output_csv_path = os.path.join(results_dir, 'validation_results.csv')
write_results_to_csv2(results, output_csv_path)
print(f"Validation results have been written to {output_csv_path}")
```
%% Output
Validation results have been written to /home/nasredine/dev/work/ai4xml/playground/analyze_akn_datasets/results/genai4lex/validation_results.csv
%% Cell type:code id: tags:
``` python
print(f'Number of valid files {valid}')
print(f'Number of invalid files {invalid}')
```
%% Output
Number of valid files 12759
Number of invalid files 2524
......
%% Cell type:code id: tags:
``` python
!pip install dspy-ai
!pip install python-dotenv
!pip install rouge-score
```
%% Cell type:code id: tags:
``` python
from dotenv import load_dotenv
import os
import json
```
%% Cell type:code id: tags:
``` python
# Read OPENAI_API_KEY from a local .env file; os.getenv returns None
# when the variable is unset, so failures surface later at API-call time.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
```
%% Cell type:markdown id: tags:
## Step 1: Setup
%% Cell type:code id: tags:
``` python
import dspy
# Configure dspy to use GPT-3.5-turbo as the default LM for all modules.
turbo = dspy.OpenAI(api_key=api_key, model='gpt-3.5-turbo')
dspy.settings.configure(lm=turbo)
```
%% Cell type:markdown id: tags:
## Step 2: Define Signatures
%% Cell type:code id: tags:
``` python
class GenerateAKN(dspy.Signature):
    """Create an XML representation of a document cover page in the Akoma Ntoso (AKN) format."""

    # Input: the plain-text cover page; Output: its AKN XML rendering.
    text = dspy.InputField(desc="Raw text format of the document cover page")
    xml = dspy.OutputField(desc="Akoma Ntoso (AKN) XML representation of the input cover page")
```
%% Cell type:markdown id: tags:
## Step 3: Building the Transformation Pipeline
%% Cell type:code id: tags:
``` python
class DocumentToXMLPipeline(dspy.Module):
    """DSPy module that converts raw cover-page text into AKN XML wrapped in <root>."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought predictor over the GenerateAKN signature.
        self.transform = dspy.ChainOfThought(GenerateAKN)

    def forward(self, text):
        # Assuming there's some text to process, otherwise return an empty XML structure
        # NOTE(review): falsy input returns "" rather than "<root></root>" —
        # confirm callers are fine with the two different return shapes.
        if not text:
            return ""
        # Generate XML for the cover page
        xml_cover_page = self.transform(text=text)
        # Wrap in a root element
        full_xml = f"<root>{xml_cover_page.xml}</root>"
        return full_xml
```
%% Cell type:markdown id: tags:
## Step 4: Executing the Pipeline (0-shot conversion without optimization)
%% Cell type:code id: tags:
``` python
xml_pipeline = DocumentToXMLPipeline()
```
%% Cell type:code id: tags:
``` python
def process_documents(dataset):
    """Run the module-level xml_pipeline over each item's 'plain_text' field.

    Returns the pipeline outputs in dataset order.
    """
    return [xml_pipeline(item['plain_text']) for item in dataset]
```
%% Cell type:code id: tags:
``` python
# Running the pipeline with the example dataset
full_xml_outputs = process_documents(example_dataset)
for output in full_xml_outputs:
print(output)
```
%% Output
Prediction(
rationale='produce the xml. We will first identify the key elements of the cover page such as the title, date, proposal number, and the entities involved. We will then structure this information in the Akoma Ntoso (AKN) format.',
xml='```xml\n<coverPage>\n <title>Proposal for a REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL amending Regulation (EC) No 1008/2008 on common rules for the operation of air services in the Community</title>\n <date>21.12.2016</date>\n <proposalNumber>2016/0411 (COD)</proposalNumber>\n <entities>\n <entity type="author">'
)
Prediction(
rationale='produce the xml. We need to identify the key elements of the document cover page such as the title, date, file number, sender, recipient, and subject. We will then structure this information in the Akoma Ntoso (AKN) format.',
xml='```xml\n<coverPage>\n <title>COUNCIL OF THE EUROPEAN UNION</title>\n <date>27 February 2017</date>\n <language>en</language>\n <fileNumber>2016/0030 (COD)</fileNumber>\n <sender>General Secretariat of the Council</sender>\n <recipient>Permanent Representatives Committee</recipient>\n <subject>Proposal for a REGULATION OF THE EUROPEAN'
)
%% Cell type:markdown id: tags:
## Step 5: Optimizing the Pipeline
%% Cell type:code id: tags:
``` python
def load_data_from_json(file_path):
    """Load and return the parsed JSON content of *file_path*.

    An explicit UTF-8 encoding avoids platform-dependent default
    encodings when reading the dataset file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def prepare_example(text, expected_xml):
    """Wrap one (text, expected_xml) pair in a dspy.Example.

    Both fields are stripped of surrounding whitespace; only 'text' is
    marked as an input so dspy treats 'expected_xml' as the label.
    """
    return dspy.Example({
        'text': text.strip(),
        'expected_xml': expected_xml.strip(),
    }).with_inputs("text")


def create_dataset(data):
    """Build a list of dspy examples from a dict of {'text', 'expected_xml'} records."""
    return [prepare_example(item['text'], item['expected_xml']) for item in data.values()]
```
%% Cell type:code id: tags:
``` python
file_path = '/Users/nasredine/dev/work/playground/dspy_programs/prefaces.json'
# Load and prepare the dataset
data = load_data_from_json(file_path)
trainset = create_dataset(data)
```
%% Cell type:code id: tags:
``` python
trainset
```
%% Output
[Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/29</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>concerning the non-approval of<span class="ITALIC">Vitis vinifera</span>cane tannins as a basic substance in accordance with Regulation (EC) No 1107/2009 of the European Parliament and of the Council concerning the placing of plant protection products on the market</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>Commission Delegated Directive (EU) 2020/12</p>\n <p>of<docDate date="2019-08-02">2 August 2019</docDate></p>\n <p>supplementing Directive (EU) 2017/2397 of the European Parliament and of the Council as regards the standards for competences and corresponding knowledge and skills, for the practical examinations, for the approval of simulators and for medical fitness</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 53</span>— Uniform provisions concerning the approval of category L<span class="SUB">3</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/31]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/24</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>authorising an extension of use of chia seeds (<span class="ITALIC">Salvia hispanica</span>) as a novel food and the change of the conditions of use and the specific labelling requirements of chia seeds (<span class="ITALIC">Salvia hispanica</span>) under Regulation (EU) 2015/2283 of the European Parliament and of the Council and amending Commission Implementing Regulation (EU) 2017/2470</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/23</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>concerning the non-renewal of the approval of the active substance thiacloprid, in accordance with Regulation (EC) No 1107/2009 of the European Parliament and of the Council concerning the placing of plant protection products on the market, and amending the Annex to Commission Implementing Regulation (EU) No 540/2011</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/22</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-31">31 October 2019</docDate></p>\n <p>amending Annexes I and III to Regulation (EU) 2019/631 of the European Parliament and of the Council as regards the monitoring of CO<span class="SUB">2</span>emissions from new light commercial vehicles type-approved in a multi-stage process</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/25</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2008 laying down detailed rules for implementation of Council Regulation (EC) No 834/2007 as regards the arrangements for imports of organic products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/30</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>amending Implementing Regulation (EU) No 404/2011 as regards detailed rules for the direct electronic exchange of information enacted under the rules of the Common Fisheries Policy</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 74</span>— Uniform provisions concerning the approval of category L<span class="SUB">1</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/32]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'})]
%% Cell type:code id: tags:
``` python
trainset[0]['expected_xml']
```
%% Output
'<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'
%% Cell type:code id: tags:
``` python
from rouge_score import rouge_scorer
```
%% Cell type:code id: tags:
``` python
def validate_xml_rouge_score(example, pred, trace=None, threshold=0.0):
    """DSPy metric: ROUGE-score *pred* against example['expected_xml'].

    Parameters
    ----------
    example : mapping with an 'expected_xml' string used as the reference.
    pred : str — the generated XML to evaluate.
    trace : unused; present to satisfy the dspy metric call signature.
    threshold : float — minimum ROUGE-L F1 required to pass. Defaults to
        0.0 (accept everything) to preserve the original behaviour, in
        which `rougeL_f1 >= 0.0` was always True and the metric never
        filtered anything; pass a higher value to make it selective.

    Returns
    -------
    bool — True when the ROUGE-L F1 score reaches the threshold.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = scorer.score(example['expected_xml'], pred)
    # Extract the F1 component of each score tuple.
    rouge1_f1 = scores['rouge1'].fmeasure
    rougeL_f1 = scores['rougeL'].fmeasure
    print("ROUGE-1 F1:", rouge1_f1, "| ROUGE-L F1:", rougeL_f1)
    return rougeL_f1 >= threshold
```
%% Cell type:code id: tags:
``` python
from dspy.teleprompt import BootstrapFewShot
# Bootstrap few-shot demonstrations from the trainset using the
# ROUGE-based metric; the compiled pipeline embeds the selected demos.
teleprompter = BootstrapFewShot(metric=validate_xml_rouge_score)
compiled_pipeline = teleprompter.compile(DocumentToXMLPipeline(), trainset=trainset)
```
%% Output
100%|██████████| 2/2 [00:00<00:00, 34.16it/s]
ROUGE-1 F1: 0.430939226519337 | ROUGE-L F1: 0.34254143646408847
ROUGE-1 F1: 0.22372881355932203 | ROUGE-L F1: 0.21694915254237288
Bootstrapped 2 full traces after 2 examples in round 0.
%% Cell type:code id: tags:
``` python
!pip install dspy-ai
```
%% Output
Collecting dspy-ai
Downloading dspy_ai-2.4.12-py3-none-any.whl.metadata (38 kB)
Collecting backoff (from dspy-ai)
Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting datasets (from dspy-ai)
Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting joblib<=1.3.2 (from dspy-ai)
Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting openai<2.0.0,>=0.28.1 (from dspy-ai)
Downloading openai-1.35.14-py3-none-any.whl.metadata (21 kB)
Collecting optuna (from dspy-ai)
Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting pandas (from dspy-ai)
Downloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting pydantic~=2.0 (from dspy-ai)
Downloading pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 125.2/125.2 kB 8.6 MB/s eta 0:00:00
[?25hCollecting regex (from dspy-ai)
Downloading regex-2024.5.15-cp39-cp39-macosx_11_0_arm64.whl.metadata (40 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 3.6 MB/s eta 0:00:00
[?25hCollecting requests (from dspy-ai)
Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting structlog (from dspy-ai)
Downloading structlog-24.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting tqdm (from dspy-ai)
Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.6/57.6 kB 8.5 MB/s eta 0:00:00
[?25hCollecting ujson (from dspy-ai)
Downloading ujson-5.10.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.3 kB)
Collecting anyio<5,>=3.5.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading anyio-4.4.0-py3-none-any.whl.metadata (4.6 kB)
Collecting distro<2,>=1.7.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting sniffio (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Requirement already satisfied: typing-extensions<5,>=4.7 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from openai<2.0.0,>=0.28.1->dspy-ai) (4.12.2)
Collecting annotated-types>=0.4.0 (from pydantic~=2.0->dspy-ai)
Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.20.1 (from pydantic~=2.0->dspy-ai)
Downloading pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting filelock (from datasets->dspy-ai)
Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets->dspy-ai)
Downloading numpy-2.0.0-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.9/60.9 kB 6.1 MB/s eta 0:00:00
[?25hCollecting pyarrow>=15.0.0 (from datasets->dspy-ai)
Downloading pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting pyarrow-hotfix (from datasets->dspy-ai)
Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->dspy-ai)
Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->dspy-ai)
Downloading xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from datasets->dspy-ai)
Downloading multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets->dspy-ai)
Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets->dspy-ai)
Downloading aiohttp-3.9.5-cp39-cp39-macosx_11_0_arm64.whl.metadata (7.5 kB)
Collecting huggingface-hub>=0.21.2 (from datasets->dspy-ai)
Downloading huggingface_hub-0.24.0-py3-none-any.whl.metadata (13 kB)
Requirement already satisfied: packaging in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from datasets->dspy-ai) (24.1)
Collecting pyyaml>=5.1 (from datasets->dspy-ai)
Downloading PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting charset-normalizer<4,>=2 (from requests->dspy-ai)
Downloading charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (33 kB)
Collecting idna<4,>=2.5 (from requests->dspy-ai)
Downloading idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting urllib3<3,>=1.21.1 (from requests->dspy-ai)
Downloading urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)
Collecting certifi>=2017.4.17 (from requests->dspy-ai)
Downloading certifi-2024.7.4-py3-none-any.whl.metadata (2.2 kB)
Collecting alembic>=1.5.0 (from optuna->dspy-ai)
Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna->dspy-ai)
Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna->dspy-ai)
Downloading SQLAlchemy-2.0.31-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.6 kB)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from pandas->dspy-ai) (2.9.0.post0)
Collecting pytz>=2020.1 (from pandas->dspy-ai)
Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas->dspy-ai)
Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting Mako (from alembic>=1.5.0->optuna->dspy-ai)
Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Requirement already satisfied: exceptiongroup>=1.0.2 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=0.28.1->dspy-ai) (1.2.2)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets->dspy-ai)
Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting attrs>=17.3.0 (from aiohttp->datasets->dspy-ai)
Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->datasets->dspy-ai)
Downloading frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp->datasets->dspy-ai)
Downloading multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl.metadata (4.2 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->datasets->dspy-ai)
Downloading yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (31 kB)
Collecting async-timeout<5.0,>=4.0 (from aiohttp->datasets->dspy-ai)
Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai<2.0.0,>=0.28.1->dspy-ai)
Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai<2.0.0,>=0.28.1->dspy-ai)
Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Requirement already satisfied: six>=1.5 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas->dspy-ai) (1.16.0)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna->dspy-ai)
Downloading MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.0 kB)
Downloading dspy_ai-2.4.12-py3-none-any.whl (276 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 276.3/276.3 kB 8.1 MB/s eta 0:00:00
[?25hDownloading joblib-1.3.2-py3-none-any.whl (302 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.2/302.2 kB 903.5 kB/s eta 0:00:00a 0:00:01
[?25hDownloading openai-1.35.14-py3-none-any.whl (328 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 328.5/328.5 kB 291.2 kB/s eta 0:00:00a 0:00:01
[?25hDownloading pydantic-2.8.2-py3-none-any.whl (423 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 423.9/423.9 kB 2.5 MB/s eta 0:00:00a 0:00:01
[?25hDownloading pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl (1.7 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading tqdm-4.66.4-py3-none-any.whl (78 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.3/78.3 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading backoff-2.2.1-py3-none-any.whl (15 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 547.8/547.8 kB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading requests-2.32.3-py3-none-any.whl (64 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.9/64.9 kB 2.2 MB/s eta 0:00:00
[?25hDownloading optuna-3.6.1-py3-none-any.whl (380 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 380.1/380.1 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.3/11.3 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading regex-2024.5.15-cp39-cp39-macosx_11_0_arm64.whl (278 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 278.3/278.3 kB 1.3 MB/s eta 0:00:0000:0100:01
[?25hDownloading structlog-24.4.0-py3-none-any.whl (67 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 67.2/67.2 kB 233.3 kB/s eta 0:00:000:00:01
[?25hDownloading ujson-5.10.0-cp39-cp39-macosx_11_0_arm64.whl (51 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 51.8/51.8 kB 2.2 MB/s eta 0:00:00
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.0/233.0 kB 3.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading annotated_types-0.7.0-py3-none-any.whl (13 kB)
Downloading anyio-4.4.0-py3-none-any.whl (86 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 2.3 MB/s eta 0:00:00
[?25hDownloading certifi-2024.7.4-py3-none-any.whl (162 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 163.0/163.0 kB 2.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl (120 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 120.4/120.4 kB 101.0 kB/s eta 0:00:00 0:00:02
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 2.9 MB/s eta 0:00:00a 0:00:01
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading fsspec-2024.5.0-py3-none-any.whl (316 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.1/316.1 kB 4.9 MB/s eta 0:00:00a 0:00:01
[?25hDownloading aiohttp-3.9.5-cp39-cp39-macosx_11_0_arm64.whl (390 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 390.7/390.7 kB 1.2 MB/s eta 0:00:00a 0:00:01m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 75.6/75.6 kB 2.4 MB/s eta 0:00:00
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 77.9/77.9 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading huggingface_hub-0.24.0-py3-none-any.whl (419 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 419.0/419.0 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading idna-3.7-py3-none-any.whl (66 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.8/66.8 kB 1.9 MB/s eta 0:00:00
[?25hDownloading numpy-2.0.0-cp39-cp39-macosx_14_0_arm64.whl (5.2 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.2/5.2 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl (27.2 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.2/27.2 MB 1.1 MB/s eta 0:00:0000:0100:01m
[?25hDownloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 505.5/505.5 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl (174 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 174.4/174.4 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading sniffio-1.3.1-py3-none-any.whl (10 kB)
Downloading SQLAlchemy-2.0.31-cp39-cp39-macosx_11_0_arm64.whl (2.1 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 629.2 kB/s eta 0:00:0000:0100:01
[?25hDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 345.4/345.4 kB 720.8 kB/s eta 0:00:000:0100:01
[?25hDownloading urllib3-2.2.2-py3-none-any.whl (121 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.4/121.4 kB 2.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading filelock-3.15.4-py3-none-any.whl (16 kB)
Downloading multiprocess-0.70.16-py39-none-any.whl (133 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 133.4/133.4 kB 4.3 MB/s eta 0:00:00
[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Downloading xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl (30 kB)
Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Downloading attrs-23.2.0-py3-none-any.whl (60 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.8/60.8 kB 3.9 MB/s eta 0:00:00
[?25hDownloading frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl (53 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.7/53.7 kB 4.9 MB/s eta 0:00:00
[?25hDownloading multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl (30 kB)
Downloading yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl (81 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 81.8/81.8 kB 5.0 MB/s eta 0:00:00
[?25hDownloading Mako-1.3.5-py3-none-any.whl (78 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.6/78.6 kB 4.8 MB/s eta 0:00:00
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.3/58.3 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl (18 kB)
Installing collected packages: pytz, xxhash, urllib3, ujson, tzdata, tqdm, structlog, sqlalchemy, sniffio, regex, pyyaml, pydantic-core, pyarrow-hotfix, numpy, multidict, MarkupSafe, joblib, idna, h11, fsspec, frozenlist, filelock, distro, dill, colorlog, charset-normalizer, certifi, backoff, attrs, async-timeout, annotated-types, yarl, requests, pydantic, pyarrow, pandas, multiprocess, Mako, httpcore, anyio, aiosignal, huggingface-hub, httpx, alembic, aiohttp, optuna, openai, datasets, dspy-ai
Successfully installed Mako-1.3.5 MarkupSafe-2.1.5 aiohttp-3.9.5 aiosignal-1.3.1 alembic-1.13.2 annotated-types-0.7.0 anyio-4.4.0 async-timeout-4.0.3 attrs-23.2.0 backoff-2.2.1 certifi-2024.7.4 charset-normalizer-3.3.2 colorlog-6.8.2 datasets-2.20.0 dill-0.3.8 distro-1.9.0 dspy-ai-2.4.12 filelock-3.15.4 frozenlist-1.4.1 fsspec-2024.5.0 h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 huggingface-hub-0.24.0 idna-3.7 joblib-1.3.2 multidict-6.0.5 multiprocess-0.70.16 numpy-2.0.0 openai-1.35.14 optuna-3.6.1 pandas-2.2.2 pyarrow-17.0.0 pyarrow-hotfix-0.6 pydantic-2.8.2 pydantic-core-2.20.1 pytz-2024.1 pyyaml-6.0.1 regex-2024.5.15 requests-2.32.3 sniffio-1.3.1 sqlalchemy-2.0.31 structlog-24.4.0 tqdm-4.66.4 tzdata-2024.1 ujson-5.10.0 urllib3-2.2.2 xxhash-3.4.1 yarl-1.9.4
%% Cell type:code id: tags:
``` python
import dspy
# NOTE(review): unlike the earlier setup cell, no api_key is passed here —
# presumably dspy.OpenAI falls back to the OPENAI_API_KEY env var; confirm.
turbo = dspy.OpenAI(model='gpt-3.5-turbo')
dspy.settings.configure(lm=turbo)
```
%% Output
/Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020
warnings.warn(
/Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
%% Cell type:code id: tags:
``` python
class GenerateXML(dspy.Signature):
    """Create an XML representation of a document cover page in the Akoma Ntoso (AKN) format."""

    # Despite the name 'section', this field carries the raw cover-page text.
    section = dspy.InputField(desc="Raw text format of the document cover page")
    xml = dspy.OutputField(desc="AKN XML representation of the input cover page")
```
%% Cell type:markdown id: tags:
Download data
%% Cell type:code id: tags:
``` python
import requests
import zipfile
import os
```
%% Cell type:code id: tags:
``` python
def download_and_extract_zip(url, extract_to):
    """
    Download a ZIP file from *url* and extract it into *extract_to*.

    The target directory is created if needed. If it already contains any
    entries, the download is skipped so repeated runs are idempotent.

    :param url: HTTP(S) URL of the ZIP archive.
    :param extract_to: destination directory for the extracted files.
    """
    # Create the output directory first: os.listdir() raises FileNotFoundError
    # on a missing path, so the original check-before-makedirs ordering
    # crashed on the very first run.
    os.makedirs(extract_to, exist_ok=True)

    # Skip the download when the directory already holds data.
    if os.listdir(extract_to):
        print(f"Data already exists in {extract_to}. Skipping download.")
        return

    # Fetch the archive and fail loudly on HTTP errors.
    response = requests.get(url)
    response.raise_for_status()  # Check that the request was successful

    # Persist the archive to disk before extraction.
    zip_path = os.path.join(extract_to, 'downloaded_files.zip')
    with open(zip_path, 'wb') as f:
        f.write(response.content)

    # Unpack everything next to the archive.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    # Remove the ZIP file after extraction.
    os.remove(zip_path)
    print(f"Files extracted to {extract_to}")
```
%% Cell type:code id: tags:
``` python
# URL of the file to be downloaded
s3_url = "https://ai4xml-data.s3.eu-west-1.amazonaws.com/planJO/selection_for_gen4ai/gen4ai_related_files.zip"
# Directory to store the extracted files
output_dir = 'data/genai4lex_word_docs/'
# Fetch and unpack the archive (skipped if output_dir already has data).
download_and_extract_zip(s3_url, output_dir)
```
%% Cell type:code id: tags:
``` python
from functions import *
```
%% Cell type:code id: tags:
``` python
# Specify the folder containing XML files and the output JSON file name
xml_folder = 'data/genai4lex_word_docs_xml'
output_json = 'prefaces.json'
# Serialize the <preface> element of each XML file into prefaces.json.
extract_preface_content(xml_folder, output_json)
```
%% Cell type:markdown id: tags:
XML comparison for test
TODO: replace ROUGE with an XML-aware comparison metric?
%% Cell type:code id: tags:
``` python
!pip install xmldiff
```
%% Output
Requirement already satisfied: xmldiff in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (2.7.0)
Requirement already satisfied: setuptools in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from xmldiff) (58.0.4)
Requirement already satisfied: lxml>=3.1.0 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from xmldiff) (5.2.2)
%% Cell type:code id: tags:
``` python
from xmldiff import main, formatting
def compare_xml_content(xml1, xml2):
    """Compare two XML documents and return their annotated differences."""
    xml_formatter = formatting.XMLFormatter()
    return main.diff_texts(xml1, xml2, formatter=xml_formatter)
# Example XML documents
xml1 = """<root>
<child1 attribute="value1">Text1</child1>
<child2>Text2</child2>
</root>"""
xml2 = """<root>
<child1 attribute="value1">Text1</child1>
<child2>Text3</child2> <!-- Changed text -->
<child3>New child</child3> <!-- New element -->
</root>"""
# Get the difference
difference = compare_xml_content(xml1, xml2)
print("Differences:", difference)
```
%% Output
Differences: <root xmlns:diff="http://namespaces.shoobx.com/diff">
<child1 attribute="value1">Text1</child1>
<child2>Text<diff:delete>2</diff:delete><diff:insert>3</diff:insert></child2><diff:delete>
</diff:delete><diff:insert> </diff:insert><child3 diff:insert="">New child</child3><diff:insert> </diff:insert></root>
%% Cell type:code id: tags:
``` python
import xml.etree.ElementTree as ET
import logging
class XmlTree():
    """Structural comparison of two XML trees parsed with xml.etree.ElementTree.

    Mismatches are logged at DEBUG level to 'xml-comparison.log'.
    """

    def __init__(self):
        # Keep hdlr/formatter as attributes (original public surface) and
        # wire them into an actual logger: the original never created
        # self.logger, so every .debug() call raised AttributeError.
        self.hdlr = logging.FileHandler('xml-comparison.log')
        self.formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        self.hdlr.setFormatter(self.formatter)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.addHandler(self.hdlr)
        self.logger.setLevel(logging.DEBUG)

    @staticmethod
    def convert_string_to_tree(xmlString):
        """Parse an XML string and return the root Element."""
        return ET.fromstring(xmlString)

    def xml_compare(self, x1, x2, excludes=None):
        """
        Compares two xml etrees.

        :param x1: the first tree
        :param x2: the second tree
        :param excludes: list of attribute names / child tags to exclude
                         from comparison (default: none)
        :return: True if both trees match
        """
        # Avoid the mutable-default-argument pitfall of the original
        # signature (excludes=[]); None behaves identically for callers.
        excludes = [] if excludes is None else excludes
        if x1.tag != x2.tag:
            self.logger.debug('Tags do not match: %s and %s' % (x1.tag, x2.tag))
            return False
        for name, value in x1.attrib.items():
            if name not in excludes:
                if x2.attrib.get(name) != value:
                    self.logger.debug('Attributes do not match: %s=%r, %s=%r'
                                      % (name, value, name, x2.attrib.get(name)))
                    return False
        for name in x2.attrib.keys():
            if name not in excludes:
                if name not in x1.attrib:
                    self.logger.debug('x2 has an attribute x1 is missing: %s'
                                      % name)
                    return False
        if not self.text_compare(x1.text, x2.text):
            self.logger.debug('text: %r != %r' % (x1.text, x2.text))
            return False
        if not self.text_compare(x1.tail, x2.tail):
            self.logger.debug('tail: %r != %r' % (x1.tail, x2.tail))
            return False
        # Element.getchildren() was removed in Python 3.9 (the notebook
        # output shows the resulting AttributeError); list(elem) is the
        # supported way to materialize an element's children.
        cl1 = list(x1)
        cl2 = list(x2)
        if len(cl1) != len(cl2):
            self.logger.debug('children length differs, %i != %i'
                              % (len(cl1), len(cl2)))
            return False
        for i, (c1, c2) in enumerate(zip(cl1, cl2), start=1):
            if c1.tag not in excludes:
                if not self.xml_compare(c1, c2, excludes):
                    self.logger.debug('children %i do not match: %s'
                                      % (i, c1.tag))
                    return False
        return True

    def text_compare(self, t1, t2):
        """
        Compare two text strings; '*' acts as a wildcard and None/empty
        strings compare as equal.

        :param t1: text one
        :param t2: text two
        :return: True if a match
        """
        if not t1 and not t2:
            return True
        if t1 == '*' or t2 == '*':
            return True
        return (t1 or '').strip() == (t2 or '').strip()
```
%% Cell type:code id: tags:
``` python
# Two documents that differ only in the <from> element's text; excluding
# the "from" tag from comparison should therefore report a match.
xml1 = "<note><to>Tove</to><from>Jani</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"
xml2 = "<note><to>Tove</to><from>Daniel</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"
tree1 = XmlTree.convert_string_to_tree(xml1)
tree2 = XmlTree.convert_string_to_tree(xml2)
comparator = XmlTree()
if comparator.xml_compare(tree1, tree2, ["from"]):
    print ("XMLs match")
else:
    print ("XMLs don't match")
```
%% Output
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[6], line 10
6 tree2 = XmlTree.convert_string_to_tree(xml2)
8 comparator = XmlTree()
---> 10 if comparator.xml_compare(tree1, tree2, ["from"]):
11 print ("XMLs match")
12 else:
Cell In[4], line 46, in XmlTree.xml_compare(self, x1, x2, excludes)
44 self.logger.debug('tail: %r != %r' % (x1.tail, x2.tail))
45 return False
---> 46 cl1 = x1.getchildren()
47 cl2 = x2.getchildren()
48 if len(cl1) != len(cl2):
AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'getchildren'
%% Cell type:markdown id: tags:
# extract text using pandoc
%% Cell type:code id: tags:
``` python
# Set the path to your documents and the output JSON file name
root_folder = '/Users/nasredine/dev/work/playground/dspy_programs/data/genai4lex_word_docs_xml'
output_json = 'md_text_output.json'
# Convert every .docx found under root_folder to Markdown (via pandoc)
# and store the results keyed by CELEX id.
process_documents(root_folder, output_json)
```
%% Cell type:code id: tags:
``` python
# Example usage — replace the placeholder literals with real XML/Markdown
# content before running; extract_and_find reports where the XML text
# appears inside the Markdown.
xml_data = """<your xml string here>"""
md_data = """<your markdown string here>"""
result = extract_and_find(xml_data, md_data)
print(result)
```
%% Cell type:code id: tags:
``` python
!pip install dspy-ai
```
%% Output
Collecting dspy-ai
Downloading dspy_ai-2.4.12-py3-none-any.whl.metadata (38 kB)
Collecting backoff (from dspy-ai)
Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting datasets (from dspy-ai)
Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting joblib<=1.3.2 (from dspy-ai)
Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting openai<2.0.0,>=0.28.1 (from dspy-ai)
Downloading openai-1.35.14-py3-none-any.whl.metadata (21 kB)
Collecting optuna (from dspy-ai)
Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting pandas (from dspy-ai)
Downloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting pydantic~=2.0 (from dspy-ai)
Downloading pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 125.2/125.2 kB 8.6 MB/s eta 0:00:00
[?25hCollecting regex (from dspy-ai)
Downloading regex-2024.5.15-cp39-cp39-macosx_11_0_arm64.whl.metadata (40 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 3.6 MB/s eta 0:00:00
[?25hCollecting requests (from dspy-ai)
Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting structlog (from dspy-ai)
Downloading structlog-24.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting tqdm (from dspy-ai)
Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.6/57.6 kB 8.5 MB/s eta 0:00:00
[?25hCollecting ujson (from dspy-ai)
Downloading ujson-5.10.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.3 kB)
Collecting anyio<5,>=3.5.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading anyio-4.4.0-py3-none-any.whl.metadata (4.6 kB)
Collecting distro<2,>=1.7.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting sniffio (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Requirement already satisfied: typing-extensions<5,>=4.7 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from openai<2.0.0,>=0.28.1->dspy-ai) (4.12.2)
Collecting annotated-types>=0.4.0 (from pydantic~=2.0->dspy-ai)
Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.20.1 (from pydantic~=2.0->dspy-ai)
Downloading pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting filelock (from datasets->dspy-ai)
Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets->dspy-ai)
Downloading numpy-2.0.0-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.9/60.9 kB 6.1 MB/s eta 0:00:00
[?25hCollecting pyarrow>=15.0.0 (from datasets->dspy-ai)
Downloading pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting pyarrow-hotfix (from datasets->dspy-ai)
Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->dspy-ai)
Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->dspy-ai)
Downloading xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from datasets->dspy-ai)
Downloading multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets->dspy-ai)
Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets->dspy-ai)
Downloading aiohttp-3.9.5-cp39-cp39-macosx_11_0_arm64.whl.metadata (7.5 kB)
Collecting huggingface-hub>=0.21.2 (from datasets->dspy-ai)
Downloading huggingface_hub-0.24.0-py3-none-any.whl.metadata (13 kB)
Requirement already satisfied: packaging in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from datasets->dspy-ai) (24.1)
Collecting pyyaml>=5.1 (from datasets->dspy-ai)
Downloading PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting charset-normalizer<4,>=2 (from requests->dspy-ai)
Downloading charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (33 kB)
Collecting idna<4,>=2.5 (from requests->dspy-ai)
Downloading idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting urllib3<3,>=1.21.1 (from requests->dspy-ai)
Downloading urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)
Collecting certifi>=2017.4.17 (from requests->dspy-ai)
Downloading certifi-2024.7.4-py3-none-any.whl.metadata (2.2 kB)
Collecting alembic>=1.5.0 (from optuna->dspy-ai)
Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna->dspy-ai)
Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna->dspy-ai)
Downloading SQLAlchemy-2.0.31-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.6 kB)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from pandas->dspy-ai) (2.9.0.post0)
Collecting pytz>=2020.1 (from pandas->dspy-ai)
Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas->dspy-ai)
Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting Mako (from alembic>=1.5.0->optuna->dspy-ai)
Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Requirement already satisfied: exceptiongroup>=1.0.2 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=0.28.1->dspy-ai) (1.2.2)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets->dspy-ai)
Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting attrs>=17.3.0 (from aiohttp->datasets->dspy-ai)
Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->datasets->dspy-ai)
Downloading frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp->datasets->dspy-ai)
Downloading multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl.metadata (4.2 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->datasets->dspy-ai)
Downloading yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (31 kB)
Collecting async-timeout<5.0,>=4.0 (from aiohttp->datasets->dspy-ai)
Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai<2.0.0,>=0.28.1->dspy-ai)
Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai<2.0.0,>=0.28.1->dspy-ai)
Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Requirement already satisfied: six>=1.5 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas->dspy-ai) (1.16.0)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna->dspy-ai)
Downloading MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.0 kB)
Downloading dspy_ai-2.4.12-py3-none-any.whl (276 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 276.3/276.3 kB 8.1 MB/s eta 0:00:00
[?25hDownloading joblib-1.3.2-py3-none-any.whl (302 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.2/302.2 kB 903.5 kB/s eta 0:00:00a 0:00:01
[?25hDownloading openai-1.35.14-py3-none-any.whl (328 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 328.5/328.5 kB 291.2 kB/s eta 0:00:00a 0:00:01
[?25hDownloading pydantic-2.8.2-py3-none-any.whl (423 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 423.9/423.9 kB 2.5 MB/s eta 0:00:00a 0:00:01
[?25hDownloading pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl (1.7 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading tqdm-4.66.4-py3-none-any.whl (78 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.3/78.3 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading backoff-2.2.1-py3-none-any.whl (15 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 547.8/547.8 kB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading requests-2.32.3-py3-none-any.whl (64 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.9/64.9 kB 2.2 MB/s eta 0:00:00
[?25hDownloading optuna-3.6.1-py3-none-any.whl (380 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 380.1/380.1 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.3/11.3 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading regex-2024.5.15-cp39-cp39-macosx_11_0_arm64.whl (278 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 278.3/278.3 kB 1.3 MB/s eta 0:00:0000:0100:01
[?25hDownloading structlog-24.4.0-py3-none-any.whl (67 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 67.2/67.2 kB 233.3 kB/s eta 0:00:000:00:01
[?25hDownloading ujson-5.10.0-cp39-cp39-macosx_11_0_arm64.whl (51 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 51.8/51.8 kB 2.2 MB/s eta 0:00:00
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.0/233.0 kB 3.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading annotated_types-0.7.0-py3-none-any.whl (13 kB)
Downloading anyio-4.4.0-py3-none-any.whl (86 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 2.3 MB/s eta 0:00:00
[?25hDownloading certifi-2024.7.4-py3-none-any.whl (162 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 163.0/163.0 kB 2.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl (120 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 120.4/120.4 kB 101.0 kB/s eta 0:00:00 0:00:02
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 2.9 MB/s eta 0:00:00a 0:00:01
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading fsspec-2024.5.0-py3-none-any.whl (316 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.1/316.1 kB 4.9 MB/s eta 0:00:00a 0:00:01
[?25hDownloading aiohttp-3.9.5-cp39-cp39-macosx_11_0_arm64.whl (390 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 390.7/390.7 kB 1.2 MB/s eta 0:00:00a 0:00:01m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 75.6/75.6 kB 2.4 MB/s eta 0:00:00
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 77.9/77.9 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading huggingface_hub-0.24.0-py3-none-any.whl (419 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 419.0/419.0 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading idna-3.7-py3-none-any.whl (66 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.8/66.8 kB 1.9 MB/s eta 0:00:00
[?25hDownloading numpy-2.0.0-cp39-cp39-macosx_14_0_arm64.whl (5.2 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.2/5.2 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl (27.2 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.2/27.2 MB 1.1 MB/s eta 0:00:0000:0100:01m
[?25hDownloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 505.5/505.5 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl (174 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 174.4/174.4 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading sniffio-1.3.1-py3-none-any.whl (10 kB)
Downloading SQLAlchemy-2.0.31-cp39-cp39-macosx_11_0_arm64.whl (2.1 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 629.2 kB/s eta 0:00:0000:0100:01
[?25hDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 345.4/345.4 kB 720.8 kB/s eta 0:00:000:0100:01
[?25hDownloading urllib3-2.2.2-py3-none-any.whl (121 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.4/121.4 kB 2.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading filelock-3.15.4-py3-none-any.whl (16 kB)
Downloading multiprocess-0.70.16-py39-none-any.whl (133 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 133.4/133.4 kB 4.3 MB/s eta 0:00:00
[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Downloading xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl (30 kB)
Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Downloading attrs-23.2.0-py3-none-any.whl (60 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.8/60.8 kB 3.9 MB/s eta 0:00:00
[?25hDownloading frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl (53 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.7/53.7 kB 4.9 MB/s eta 0:00:00
[?25hDownloading multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl (30 kB)
Downloading yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl (81 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 81.8/81.8 kB 5.0 MB/s eta 0:00:00
[?25hDownloading Mako-1.3.5-py3-none-any.whl (78 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.6/78.6 kB 4.8 MB/s eta 0:00:00
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.3/58.3 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl (18 kB)
Installing collected packages: pytz, xxhash, urllib3, ujson, tzdata, tqdm, structlog, sqlalchemy, sniffio, regex, pyyaml, pydantic-core, pyarrow-hotfix, numpy, multidict, MarkupSafe, joblib, idna, h11, fsspec, frozenlist, filelock, distro, dill, colorlog, charset-normalizer, certifi, backoff, attrs, async-timeout, annotated-types, yarl, requests, pydantic, pyarrow, pandas, multiprocess, Mako, httpcore, anyio, aiosignal, huggingface-hub, httpx, alembic, aiohttp, optuna, openai, datasets, dspy-ai
Successfully installed Mako-1.3.5 MarkupSafe-2.1.5 aiohttp-3.9.5 aiosignal-1.3.1 alembic-1.13.2 annotated-types-0.7.0 anyio-4.4.0 async-timeout-4.0.3 attrs-23.2.0 backoff-2.2.1 certifi-2024.7.4 charset-normalizer-3.3.2 colorlog-6.8.2 datasets-2.20.0 dill-0.3.8 distro-1.9.0 dspy-ai-2.4.12 filelock-3.15.4 frozenlist-1.4.1 fsspec-2024.5.0 h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 huggingface-hub-0.24.0 idna-3.7 joblib-1.3.2 multidict-6.0.5 multiprocess-0.70.16 numpy-2.0.0 openai-1.35.14 optuna-3.6.1 pandas-2.2.2 pyarrow-17.0.0 pyarrow-hotfix-0.6 pydantic-2.8.2 pydantic-core-2.20.1 pytz-2024.1 pyyaml-6.0.1 regex-2024.5.15 requests-2.32.3 sniffio-1.3.1 sqlalchemy-2.0.31 structlog-24.4.0 tqdm-4.66.4 tzdata-2024.1 ujson-5.10.0 urllib3-2.2.2 xxhash-3.4.1 yarl-1.9.4
%% Cell type:markdown id: tags:
## Step 1: Setup
%% Cell type:code id: tags:
``` python
import dspy
# Configure DSPy's global settings to use OpenAI gpt-3.5-turbo as the LM.
turbo = dspy.OpenAI(model='gpt-3.5-turbo')
dspy.settings.configure(lm=turbo)
```
%% Output
/Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020
warnings.warn(
/Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
%% Cell type:markdown id: tags:
## Step 2: Define Signatures
%% Cell type:code id: tags:
``` python
class GenerateAKN(dspy.Signature):
    """Transform word document into Akoma ntoso (AKN) XML format."""
    # The desc strings below are injected into the LM prompt by DSPy.
    section = dspy.InputField(desc="Full document that contains title cover-page, body text, articles, etc")
    xml = dspy.OutputField(desc="XML representation of the input section in AKN format")
```
%% Cell type:markdown id: tags:
## Step 3: Building the Transformation Pipeline
%% Cell type:code id: tags:
``` python
class DocumentToXMLPipeline(dspy.Module):
    """Generate AKN XML for each section of a document and wrap the parts in <root>."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought generation driven by the GenerateAKN signature.
        self.transform = dspy.ChainOfThought(GenerateAKN)

    def forward(self, document):
        """Transform every section of *document* and concatenate the results.

        Accepts either an object exposing a ``sections`` attribute (each
        section exposing ``.text``) or a plain dict of the shape
        ``{"sections": [{"text": ...}, ...]}``. The original only handled
        attribute access, so the dict passed by the example caller raised
        AttributeError.
        """
        sections = document["sections"] if isinstance(document, dict) else document.sections
        xml_parts = []
        for section in sections:
            section_text = section["text"] if isinstance(section, dict) else section.text
            xml_part = self.transform(section=section_text)
            xml_parts.append(xml_part.xml)
        # Wrap all generated fragments in a single root element.
        full_xml = "<root>" + "".join(xml_parts) + "</root>"
        return full_xml
```
%% Cell type:markdown id: tags:
## Step 4: Executing the Pipeline
%% Cell type:code id: tags:
``` python
# NOTE(review): DocumentToXMLPipeline.forward accesses document.sections /
# section.text, so passing this plain dict raises AttributeError — confirm
# whether a wrapper object (like the commented-out Document/Section classes
# elsewhere in this file) was intended.
document = {
    "sections": [
        {"title": "Cover Page", "text": "Document Title"},
        {"title": "Introduction", "text": "Here is the introduction..."},
        {"title": "Conclusion", "text": "Here is the conclusion..."}
    ]
}
xml_pipeline = DocumentToXMLPipeline()
full_xml_document = xml_pipeline(document)
print(full_xml_document)
```
import re
import os
import json
import subprocess
import xml.etree.ElementTree as ET
import requests
import zipfile
def normalize_text(text):
    """Collapse all whitespace runs to single spaces, lowercase, and trim."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip().lower()
def extract_and_find(xml_string, md_string, length=30):
    """Locate the normalized XML text inside the normalized Markdown text.

    Uses the first and last *length* characters of the normalized XML as
    anchors and reports the span they delimit in the Markdown.
    """
    # Normalize both inputs identically so whitespace/case differences
    # do not prevent matching.
    normalized_xml = normalize_text(xml_string)
    normalized_md = normalize_text(md_string)

    head = normalized_xml[:length]
    tail = normalized_xml[-length:]

    start_index = normalized_md.find(head)
    end_index = normalized_md.rfind(tail)

    span_found = start_index != -1 and end_index != -1 and start_index < end_index
    if span_found:
        return f"Text likely spans from index {start_index} to {end_index + length} in the Markdown file."
    return "Matching text not found in Markdown."
def remove_namespaces(xml_element):
    """Recursively strip namespace prefixes from tags and attributes, in place."""
    for node in xml_element.iter():
        # Namespaced tags parse as '{uri}localname'; keep only the local part.
        if '}' in node.tag:
            node.tag = node.tag.split('}', 1)[1]
        # Re-key namespaced attributes under their local names.
        for attr_name in list(node.attrib):
            if '}' in attr_name:
                local_name = attr_name.split('}', 1)[1]
                node.attrib[local_name] = node.attrib.pop(attr_name)
# NOTE(review): this definition is byte-for-byte identical to the
# remove_namespaces defined immediately above; Python keeps this second
# binding, so one of the two should be deleted.
def remove_namespaces(xml_element):
    """ Recursively remove namespace prefixes from an XML element and its children. """
    for elem in xml_element.iter():
        if '}' in elem.tag:
            elem.tag = elem.tag.split('}', 1)[1]  # Removes namespace
        # Update attributes to remove namespaces
        attributes = list(elem.attrib.keys())
        for attr in attributes:
            if '}' in attr:
                new_attr = attr.split('}', 1)[1]
                elem.attrib[new_attr] = elem.attrib.pop(attr)
def extract_preface_content(xml_folder, output_json):
    """Collect the <preface> element of every XML file under *xml_folder*.

    Walks the folder recursively; for each ``*.xml`` file the namespace-free
    <preface> subtree is serialized. Results are keyed by the file name
    without extension (used as the CELEX id) and written to *output_json*.

    :param xml_folder: root directory to scan for XML files.
    :param output_json: path of the JSON file to write.
    """
    results = {}
    # Iterate over every XML file in the folder and its subfolders
    for root_dir, sub_dirs, files in os.walk(xml_folder):
        for filename in files:
            if filename.endswith('.xml'):
                # The file name without extension serves as the document id.
                filename_no_ext = os.path.splitext(filename)[0]
                file_path = os.path.join(root_dir, filename)

                tree = ET.parse(file_path)
                root = tree.getroot()

                # Strip namespaces so the search below can use plain tag names.
                remove_namespaces(root)

                # Extract the entire <preface> element, if present.
                preface = root.find('.//preface')
                if preface is not None:
                    # Serialize the <preface> element including its content.
                    preface_xml = ET.tostring(preface, encoding='unicode')
                else:
                    preface_xml = "No preface found"
                results[filename_no_ext] = {
                    'celex_id': filename_no_ext,
                    'expected_xml': preface_xml,
                    'text': ""
                }

    # Write results exactly once: the original called json.dump twice on the
    # same handle, producing a doubled — and therefore invalid — JSON file.
    with open(output_json, 'w') as json_file:
        json.dump(results, json_file, indent=4)
def convert_docx_to_md(docx_path):
    """Convert a DOCX file to Markdown using Pandoc.

    :param docx_path: path to the ``.docx`` file.
    :return: the Markdown text, or None if the conversion failed.
    """
    try:
        # check=True makes a non-zero pandoc exit raise CalledProcessError;
        # without it the except branch below was unreachable and failed
        # conversions silently returned pandoc's (possibly empty) stdout.
        result = subprocess.run(
            ['pandoc', '-f', 'docx', '-t', 'markdown', docx_path],
            capture_output=True, text=True, check=True,
        )
        return result.stdout
    except subprocess.CalledProcessError as e:
        print("An error occurred while converting DOCX to Markdown:", e)
        return None
def process_documents(root_folder, output_json):
    """Convert every .docx under *root_folder* to Markdown and save as JSON."""
    results = {}
    # Walk the directory tree; the parent folder name is assumed to be the
    # CELEX id of the document it contains.
    for current_dir, _dirs, filenames in os.walk(root_folder):
        for name in filenames:
            if not name.endswith('.docx'):
                continue
            celex_id = os.path.basename(current_dir)
            markdown_text = convert_docx_to_md(os.path.join(current_dir, name))
            if markdown_text:
                results[celex_id] = markdown_text
    # Persist the collected Markdown keyed by CELEX id.
    with open(output_json, 'w') as json_file:
        json.dump(results, json_file, indent=4)
def download_and_extract_zip(script_dir, zip_url):
    """Download Documents.zip from *zip_url* into *script_dir* and extract it.

    NOTE(review): unlike the notebook variant of this helper, this version
    neither creates script_dir nor skips when data already exists — callers
    are expected to check for the 'Documents' folder first.
    """
    print("The 'Documents' folder does not exist. Downloading and extracting the zip file...")
    # Download the zip file
    zip_file = os.path.join(script_dir, "Documents.zip")
    response = requests.get(zip_url)
    with open(zip_file, "wb") as f:
        f.write(response.content)
    # Extract the zip file
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(script_dir)
    # Remove the downloaded zip file
    os.remove(zip_file)
\ No newline at end of file
%% Cell type:code id: tags:
``` python
!pip install dspy-ai
!pip install python-dotenv
!pip install rouge-score
```
%% Output
13008.28s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
Requirement already satisfied: rouge-score in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (0.1.2)
Requirement already satisfied: absl-py in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from rouge-score) (2.1.0)
Requirement already satisfied: nltk in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from rouge-score) (3.8.1)
Requirement already satisfied: numpy in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from rouge-score) (2.0.0)
Requirement already satisfied: six>=1.14.0 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from rouge-score) (1.16.0)
Requirement already satisfied: click in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from nltk->rouge-score) (8.1.7)
Requirement already satisfied: joblib in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from nltk->rouge-score) (1.3.2)
Requirement already satisfied: regex>=2021.8.3 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from nltk->rouge-score) (2024.5.15)
Requirement already satisfied: tqdm in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from nltk->rouge-score) (4.66.4)
%% Cell type:code id: tags:
``` python
from dotenv import load_dotenv
import os
import json
import re
```
%% Cell type:code id: tags:
``` python
# Load environment variables from a local .env file and read the OpenAI key.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
```
%% Cell type:markdown id: tags:
## Step 1: Setup
%% Cell type:code id: tags:
``` python
import dspy
# Use gpt-4o-mini with a large completion budget, since full AKN prefaces
# can be long.
turbo = dspy.OpenAI(api_key=api_key, model='gpt-4o-mini', max_tokens=10000)
dspy.settings.configure(lm=turbo)
```
%% Cell type:markdown id: tags:
## Step 2: Define Signatures
%% Cell type:code id: tags:
``` python
# class Document:
# """A simple document class to simulate the expected structure."""
# def __init__(self, text):
# self.sections = [Section(text)]
# class Section:
# """A section of the document."""
# def __init__(self, text):
# self.text = text
class GenerateAKN(dspy.Signature):
    """Create an XML representation of a document preface section in the Akoma Ntoso (AKN) format."""
    # Field desc strings are injected into the LM prompt; the "prefece"
    # typo is fixed so the model sees the intended word "preface".
    text = dspy.InputField(desc="Raw text format of the document preface section")
    xml = dspy.OutputField(desc="Akoma Ntoso (AKN) XML representation of the input preface")
```
%% Cell type:markdown id: tags:
## Step 3: Building the Transformation Pipeline
%% Cell type:code id: tags:
``` python
class DocumentToXMLPipeline(dspy.Module):
    """Transform raw preface text into an Akoma Ntoso <preface> XML fragment."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought generation driven by the GenerateAKN signature.
        self.transform = dspy.ChainOfThought(GenerateAKN)

    def extract_xml(self, content):
        """Return the first <preface>...</preface> fragment found in *content*.

        The DOTALL flag lets the pattern span newlines; an empty string is
        returned when no fragment is present.
        """
        found = re.search(r'<preface>.*?</preface>', content, re.DOTALL)
        return found.group(0) if found else ""

    def forward(self, text):
        """Generate AKN XML for *text* and keep only the <preface> part."""
        # Empty input yields an empty XML structure.
        if not text:
            return ""
        generated = self.transform(text=text)
        return self.extract_xml(generated.xml)
```
%% Cell type:markdown id: tags:
## Step 4: Executing the Pipeline (0-shot conversion without optimization)
%% Cell type:code id: tags:
``` python
# Instantiate the preface-to-AKN pipeline once for reuse below.
xml_pipeline = DocumentToXMLPipeline()
```
%% Cell type:code id: tags:
``` python
def process_documents(dataset):
    """Run the XML pipeline over every plain-text preface in *dataset*."""
    preface_texts = [item['plain_text'] for item in dataset]
    # Convert each preface independently and collect the XML fragments.
    return [xml_pipeline(doc) for doc in preface_texts]
```
%% Cell type:code id: tags:
``` python
# Running the pipeline with the example dataset
# full_xml_outputs = process_documents(example_dataset)
# for output in full_xml_outputs:
# print(output)
```
%% Cell type:markdown id: tags:
## Step 5: Optimizing the Pipeline
%% Cell type:code id: tags:
``` python
def load_data_from_json(file_path):
    """Read *file_path* and return its parsed JSON content."""
    with open(file_path, 'r') as handle:
        return json.load(handle)
def prepare_example(text, xml):
    """Build a DSPy Example with 'text' as the input field and 'xml' as the label."""
    # Strip surrounding whitespace from both sides of the pair.
    payload = {
        'text': text.strip(),
        'xml': xml.strip(),
    }
    return dspy.Example(payload).with_inputs("text")
def create_dataset(data):
    """Convert the loaded JSON mapping into a list of DSPy examples,
    one per entry, via ``prepare_example``."""
    examples = []
    for entry in data.values():
        examples.append(prepare_example(entry['text'], entry['xml']))
    return examples
```
%% Cell type:code id: tags:
``` python
# NOTE(review): hard-coded absolute path to one developer's machine —
# consider making this relative to the repo's data directory.
file_path = '/Users/nasredine/dev/work/playground/dspy_programs/prefaces.json'
# Load and prepare the dataset
data = load_data_from_json(file_path)
# NOTE(review): 'dataset' and 'trainset' are built identically from the
# same data — there is no held-out split here; confirm this is intended.
dataset = create_dataset(data)
trainset = create_dataset(data)
```
%% Cell type:code id: tags:
``` python
trainset
```
%% Output
[Example({'text': '.\n\nCommission Delegated Regulation (EU) 2020/...\n\nof 29 October 2019\n\namending Regulation (EC) No 1272/2008 of the European Parliament and of\nthe Council on classification, labelling and packaging of substances and\nmixtures as regards information relating to emergency health response\n\n**(Text with EEA relevance)**', 'xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'COMMISSION IMPLEMENTING REGULATION (EU) 2020/...\n\nof 14 January 2020\n\nconcerning the non-approval of *Vitis vinifera* cane tannins as a basic\nsubstance in accordance with Regulation (EC) No 1107/2009 of the\nEuropean Parliament and of the Council concerning the placing of plant\nprotection products on the market\n\n**(Text with EEA relevance)**', 'xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/29</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>concerning the non-approval of<span class="ITALIC">Vitis vinifera</span>cane tannins as a basic substance in accordance with Regulation (EC) No 1107/2009 of the European Parliament and of the Council concerning the placing of plant protection products on the market</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'Commission Delegated Directive (EU) 2019/...\n\nof 2 August 2019\n\nsupplementing Directive (EU) 2017/2397 of the European Parliament and of\nthe Council as regards the standards for competences and corresponding\nknowledge and skills, for the practical examinations, for the approval\nof simulators and for medical fitness\n\n(Text with EEA relevance)', 'xml': '<preface>\n <longTitle>\n <p>Commission Delegated Directive (EU) 2020/12</p>\n <p>of<docDate date="2019-08-02">2 August 2019</docDate></p>\n <p>supplementing Directive (EU) 2017/2397 of the European Parliament and of the Council as regards the standards for competences and corresponding knowledge and skills, for the practical examinations, for the approval of simulators and for medical fitness</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'UN Regulation No 53 --- Uniform provisions concerning the approval of\ncategory L~3~ vehicles with regard to the installation of lighting and\nlight-signalling devices \\[2019/\\...\\]**', 'xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 53</span>— Uniform provisions concerning the approval of category L<span class="SUB">3</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/31]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'COMMISSION DELEGATED REGULATION (EU) 2020/...\n\nof [31 October 2019]{.mark}\n\namending Annexes I and III to Regulation (EU) 2019/631 of the European\nParliament and of the Council as regards the monitoring of CO~2~\nemissions from new light commercial vehicles type-approved in a\nmulti-stage process\n\n**(Text with EEA relevance)**', 'xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/22</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-31">31 October 2019</docDate></p>\n <p>amending Annexes I and III to Regulation (EU) 2019/631 of the European Parliament and of the Council as regards the monitoring of CO<span class="SUB">2</span>emissions from new light commercial vehicles type-approved in a multi-stage process</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'COMMISSION IMPLEMENTING REGULATION (EU) 2020/...\n\nof 13 January 2020\n\namending and correcting Regulation (EC) No 1235/2008 laying down\ndetailed rules for implementation of Council Regulation (EC) No 834/2007\nas regards the arrangements for imports of organic products from third\ncountries\n\n**(Text with EEA relevance)**', 'xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/25</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2008 laying down detailed rules for implementation of Council Regulation (EC) No 834/2007 as regards the arrangements for imports of organic products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'COMMISSION IMPLEMENTING REGULATION (EU) 2020/...\n\nof 14 January 2020\n\namending Implementing Regulation (EU) No 404/2011 as regards detailed\nrules for the direct electronic exchange of information enacted under\nthe rules of the Common Fisheries Policy', 'xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/30</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>amending Implementing Regulation (EU) No 404/2011 as regards detailed rules for the direct electronic exchange of information enacted under the rules of the Common Fisheries Policy</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '**UN Regulation No 74 --- Uniform provisions concerning the approval of\ncategory L~1~ vehicles with regard to the installation of lighting and\nlight-signalling devices \\[2019/...\\]**', 'xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 74</span>— Uniform provisions concerning the approval of category L<span class="SUB">1</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/32]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'})]
%% Cell type:code id: tags:
``` python
trainset[0]['xml']
```
%% Output
'<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'
%% Cell type:code id: tags:
``` python
from rouge_score import rouge_scorer
```
%% Cell type:code id: tags:
``` python
def validate_xml_rouge_score(example, pred, trace=None, threshold=0.7):
    """DSPy metric: accept *pred* when its ROUGE-L F1 against the gold
    XML in *example* meets *threshold*.

    :param example: example carrying the reference XML under ``'xml'``.
    :param pred: generated XML string to score.
    :param trace: unused; present to satisfy DSPy's metric signature.
    :param threshold: minimum ROUGE-L F1 for acceptance (default 0.7,
        previously hard-coded — now tunable without editing the body).
    :returns: True if the prediction passes the threshold.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = scorer.score(example['xml'], pred)
    # F1 (fmeasure) balances precision and recall of token overlap.
    rouge1_f1 = scores['rouge1'].fmeasure
    rougeL_f1 = scores['rougeL'].fmeasure
    print("ROUGE-1 F1:", rouge1_f1, "| ROUGE-L F1:", rougeL_f1)
    return rougeL_f1 >= threshold
```
%% Cell type:code id: tags:
``` python
from dspy.teleprompt import BootstrapFewShot
# Optimize the pipeline: bootstrap few-shot demonstrations from the
# trainset, keeping only traces that pass the ROUGE-based metric.
teleprompter = BootstrapFewShot(metric=validate_xml_rouge_score)
compiled_pipeline = teleprompter.compile(DocumentToXMLPipeline(), trainset=trainset)
```
%% Output
50%|█████ | 4/8 [00:00<00:00, 187.45it/s]
ROUGE-1 F1: 0.9857142857142858 | ROUGE-L F1: 0.9857142857142858
ROUGE-1 F1: 0.9565217391304348 | ROUGE-L F1: 0.9565217391304348
ROUGE-1 F1: 0.9305555555555556 | ROUGE-L F1: 0.9305555555555556
ROUGE-1 F1: 0.9629629629629629 | ROUGE-L F1: 0.9629629629629629
Bootstrapped 4 full traces after 5 examples in round 0.
%% Cell type:code id: tags:
``` python
compiled_pipeline.save("prefaces.prog.json")
```
%% Output
[('transform', Predict(StringSignature(text -> rationale, xml
instructions='Create an XML representation of a document preface section in the Akoma Ntoso (AKN) format.'
text = Field(annotation=str required=True json_schema_extra={'desc': 'Raw text format of the document prefece section', '__dspy_field_type': 'input', 'prefix': 'Text:'})
rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the xml}. We ...', '__dspy_field_type': 'output'})
xml = Field(annotation=str required=True json_schema_extra={'desc': 'Akoma Ntoso (AKN) XML representation of the input preface', '__dspy_field_type': 'output', 'prefix': 'Xml:'})
)))]
%% Cell type:code id: tags:
``` python
text = "COMMISSION IMPLEMENTING REGULATION (EU) 2021/...\n\nof 13 January 2021\n\namending and correcting Regulation (EC) No 1235/2009 laying down\ndetailed rules for implementation of Council Regulation (EC) No 834/2008\nas regards the arrangements for imports of electrical products from third\ncountries\n\n**(Text with EEA relevance)**\n\n"
```
%% Cell type:code id: tags:
``` python
xml = compiled_pipeline(text)
```
%% Cell type:code id: tags:
``` python
xml
```
%% Output
'<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2021/...</docNumber></span>\n </p>\n <p>of<docDate date="2021-01-13">13 January 2021</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2009 laying down detailed rules for implementation of Council Regulation (EC) No 834/2008 as regards the arrangements for imports of electrical products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n</preface>'
%% Cell type:markdown id: tags:
'```xml\n<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2021/...</docNumber></span>\n </p>\n <p>of<docDate date="2021-01-13">13 January 2021</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2009 laying down detailed rules for implementation of Council Regulation (EC) No 834/2008 as regards the arrangements for imports of electrical products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n</preface>\n```'
%% Cell type:markdown id: tags:
<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/25</docNumber></span>\n </p>\n <p>of<docDate date=\"2020-01-13\">13 January 2020</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2008 laying down detailed rules for implementation of Council Regulation (EC) No 834/2007 as regards the arrangements for imports of organic products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>\n
%% Cell type:code id: tags:
``` python
compiled_pipeline
```
%% Output
transform = Predict(StringSignature(text -> rationale, xml
instructions='Create an XML representation of a document preface section in the Akoma Ntoso (AKN) format.'
text = Field(annotation=str required=True json_schema_extra={'desc': 'Raw text format of the document prefece section', '__dspy_field_type': 'input', 'prefix': 'Text:'})
rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the xml}. We ...', '__dspy_field_type': 'output'})
xml = Field(annotation=str required=True json_schema_extra={'desc': 'Akoma Ntoso (AKN) XML representation of the input preface', '__dspy_field_type': 'output', 'prefix': 'Xml:'})
))