Code development platform for open source projects from the European Union institutions

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • ai4xml/playground
1 result
Show changes
Commits on Source (4)
......@@ -9,3 +9,6 @@ analyze_akn_datasets/__pycache__
Conval_API/ids.csv
prompt_engineering_experiments/evaluation_results/
prompt_engineering_experiments/data/*
dspy_programs/data
.DS_Store
%% Cell type:code id: tags:
``` python
!pip install tqdm
!pip install lxml
```
%% Output
Requirement already satisfied: tqdm in /home/nasredine/dev/work/ai4xml/playground/myenv/lib/python3.11/site-packages (4.66.2)
Requirement already satisfied: lxml in /home/nasredine/dev/work/ai4xml/playground/myenv/lib/python3.11/site-packages (5.2.2)
%% Cell type:code id: tags:
``` python
import os
from functions import *
```
%% Cell type:code id: tags:
``` python
# Resolve all data/result paths relative to the notebook's working directory.
cwd = os.getcwd()
documents_dir = os.path.join(f'{cwd}/data/genai4lex', 'Documents')
results_dir = os.path.join(cwd, 'results/genai4lex')
```
%% Cell type:markdown id: tags:
### download AKN documents from genai4lex repo
%% Cell type:code id: tags:
``` python
# Create results_dir (and any missing parents) if it does not exist.
# exist_ok=True is idempotent and avoids the check-then-create race of
# the previous isdir()/makedirs() pair.
os.makedirs(results_dir, exist_ok=True)
```
%% Cell type:code id: tags:
``` python
# Check if the 'Documents' folder exists, if not, download and extract the zip file
if not os.path.isdir(documents_dir):
    zip_url = "https://gitlab.com/CIRSFID/genai4lex/-/raw/main/LegalResources/Eur-Lex/2010-2021/Documents.zip?inline=false"
    download_and_extract_zip(cwd, zip_url) # Ensure this function is defined to handle download and extraction
# Re-check after the download attempt; abort the notebook run if the
# expected folder still did not appear.
if not os.path.isdir(documents_dir):
    print("Invalid directory path.")
    exit()
```
%% Cell type:markdown id: tags:
### download AKN schema
%% Cell type:code id: tags:
``` python
# Fetch the Akoma Ntoso 3.0 XSD once; the download is skipped when the
# file is already on disk.
schema_dir = os.path.join(cwd, 'schema')
os.makedirs(schema_dir, exist_ok=True)
# Download Akoma Ntoso Schema
schema_url = "https://docs.oasis-open.org/legaldocml/akn-core/v1.0/os/part2-specs/schemas/akomantoso30.xsd"
schema_path = os.path.join(schema_dir, 'akomantoso30.xsd')
if not os.path.exists(schema_path):
    download_schema(schema_url, schema_path)
```
%% Cell type:markdown id: tags:
### Analysis and statistics
%% Cell type:code id: tags:
``` python
# Assuming analyze_xml_files and the associated functions are defined elsewhere
results, stats = analyze_xml_files(documents_dir,schema_path)
```
%% Output
Analyzing XML files: 100%|██████████| 15283/15283 [00:22<00:00, 674.22it/s]
%% Cell type:code id: tags:
``` python
# Persist per-document analysis results and the aggregate statistics as CSV.
output_csv_path = os.path.join(results_dir, 'results.csv')
# Write results to CSV
write_results_to_csv(results, output_csv_path) # Ensure this function is defined
stats_file_path = os.path.join(results_dir, 'statistics.csv')
write_stats_to_file(stats, stats_file_path) # Ensure this function is defined
```
%% Cell type:code id: tags:
``` python
print(stats)
```
%% Output
{'Average Total Pages': 4.1452594385919, 'Missing Total Pages': 0, 'Missing OJ Number': 0, 'Missing Publication Date': 0, 'Earliest Publication Date': '2010-01-05', 'Latest Publication Date': '2021-08-09'}
%% Cell type:markdown id: tags:
## Schema validation
%% Cell type:markdown id: tags:
### validate documents
%% Cell type:code id: tags:
``` python
results, valid, invalid = validate_xml_files(documents_dir, schema_path)
```
%% Output
Validating XML files: 100%|██████████| 15283/15283 [00:16<00:00, 934.79file/s]
%% Cell type:code id: tags:
``` python
output_csv_path = os.path.join(results_dir, 'validation_results.csv')
write_results_to_csv2(results, output_csv_path)
print(f"Validation results have been written to {output_csv_path}")
```
%% Output
Validation results have been written to /home/nasredine/dev/work/ai4xml/playground/analyze_akn_datasets/results/genai4lex/validation_results.csv
%% Cell type:code id: tags:
``` python
print(f'Number of valid files {valid}')
print(f'Number of invalid files {invalid}')
```
%% Output
Number of valid files 12759
Number of invalid files 2524
......
%% Cell type:code id: tags:
``` python
!pip install dspy-ai
!pip install python-dotenv
!pip install rouge-score
```
%% Cell type:code id: tags:
``` python
from dotenv import load_dotenv
import os
import json
```
%% Cell type:code id: tags:
``` python
# Read OPENAI_API_KEY from a local .env file; os.getenv returns None
# when the variable is unset, so failures surface later at API-call time.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
```
%% Cell type:markdown id: tags:
## Step 1: Setup
%% Cell type:code id: tags:
``` python
import dspy
# Configure dspy to use GPT-3.5-turbo as the default LM for all modules.
turbo = dspy.OpenAI(api_key=api_key, model='gpt-3.5-turbo')
dspy.settings.configure(lm=turbo)
```
%% Cell type:markdown id: tags:
## Step 2: Define Signatures
%% Cell type:code id: tags:
``` python
class GenerateAKN(dspy.Signature):
    """Create an XML representation of a document cover page in the Akoma Ntoso (AKN) format."""

    # Input: the plain-text cover page; Output: its AKN XML rendering.
    text = dspy.InputField(desc="Raw text format of the document cover page")
    xml = dspy.OutputField(desc="Akoma Ntoso (AKN) XML representation of the input cover page")
```
%% Cell type:markdown id: tags:
## Step 3: Building the Transformation Pipeline
%% Cell type:code id: tags:
``` python
class DocumentToXMLPipeline(dspy.Module):
    """DSPy module that converts raw cover-page text into AKN XML wrapped in <root>."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought predictor over the GenerateAKN signature.
        self.transform = dspy.ChainOfThought(GenerateAKN)

    def forward(self, text):
        # Assuming there's some text to process, otherwise return an empty XML structure
        # NOTE(review): falsy input returns "" rather than "<root></root>" —
        # confirm callers are fine with the two different return shapes.
        if not text:
            return ""
        # Generate XML for the cover page
        xml_cover_page = self.transform(text=text)
        # Wrap in a root element
        full_xml = f"<root>{xml_cover_page.xml}</root>"
        return full_xml
```
%% Cell type:markdown id: tags:
## Step 4: Executing the Pipeline (0-shot conversion without optimization)
%% Cell type:code id: tags:
``` python
xml_pipeline = DocumentToXMLPipeline()
```
%% Cell type:code id: tags:
``` python
def process_documents(dataset):
    """Run the module-level xml_pipeline over each item's 'plain_text' field.

    Returns the pipeline outputs in dataset order.
    """
    return [xml_pipeline(item['plain_text']) for item in dataset]
```
%% Cell type:code id: tags:
``` python
# Running the pipeline with the example dataset
full_xml_outputs = process_documents(example_dataset)
for output in full_xml_outputs:
print(output)
```
%% Output
Prediction(
rationale='produce the xml. We will first identify the key elements of the cover page such as the title, date, proposal number, and the entities involved. We will then structure this information in the Akoma Ntoso (AKN) format.',
xml='```xml\n<coverPage>\n <title>Proposal for a REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL amending Regulation (EC) No 1008/2008 on common rules for the operation of air services in the Community</title>\n <date>21.12.2016</date>\n <proposalNumber>2016/0411 (COD)</proposalNumber>\n <entities>\n <entity type="author">'
)
Prediction(
rationale='produce the xml. We need to identify the key elements of the document cover page such as the title, date, file number, sender, recipient, and subject. We will then structure this information in the Akoma Ntoso (AKN) format.',
xml='```xml\n<coverPage>\n <title>COUNCIL OF THE EUROPEAN UNION</title>\n <date>27 February 2017</date>\n <language>en</language>\n <fileNumber>2016/0030 (COD)</fileNumber>\n <sender>General Secretariat of the Council</sender>\n <recipient>Permanent Representatives Committee</recipient>\n <subject>Proposal for a REGULATION OF THE EUROPEAN'
)
%% Cell type:markdown id: tags:
## Step 5: Optimizing the Pipeline
%% Cell type:code id: tags:
``` python
def load_data_from_json(file_path):
    """Load and return the parsed JSON content of *file_path*.

    An explicit UTF-8 encoding avoids platform-dependent default
    encodings when reading the dataset file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def prepare_example(text, expected_xml):
    """Wrap one (text, expected_xml) pair in a dspy.Example.

    Both fields are stripped of surrounding whitespace; only 'text' is
    marked as an input so dspy treats 'expected_xml' as the label.
    """
    return dspy.Example({
        'text': text.strip(),
        'expected_xml': expected_xml.strip(),
    }).with_inputs("text")


def create_dataset(data):
    """Build a list of dspy examples from a dict of {'text', 'expected_xml'} records."""
    return [prepare_example(item['text'], item['expected_xml']) for item in data.values()]
```
%% Cell type:code id: tags:
``` python
file_path = '/Users/nasredine/dev/work/playground/dspy_programs/prefaces.json'
# Load and prepare the dataset
data = load_data_from_json(file_path)
trainset = create_dataset(data)
```
%% Cell type:code id: tags:
``` python
trainset
```
%% Output
[Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/29</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>concerning the non-approval of<span class="ITALIC">Vitis vinifera</span>cane tannins as a basic substance in accordance with Regulation (EC) No 1107/2009 of the European Parliament and of the Council concerning the placing of plant protection products on the market</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>Commission Delegated Directive (EU) 2020/12</p>\n <p>of<docDate date="2019-08-02">2 August 2019</docDate></p>\n <p>supplementing Directive (EU) 2017/2397 of the European Parliament and of the Council as regards the standards for competences and corresponding knowledge and skills, for the practical examinations, for the approval of simulators and for medical fitness</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 53</span>— Uniform provisions concerning the approval of category L<span class="SUB">3</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/31]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/24</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>authorising an extension of use of chia seeds (<span class="ITALIC">Salvia hispanica</span>) as a novel food and the change of the conditions of use and the specific labelling requirements of chia seeds (<span class="ITALIC">Salvia hispanica</span>) under Regulation (EU) 2015/2283 of the European Parliament and of the Council and amending Commission Implementing Regulation (EU) 2017/2470</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/23</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>concerning the non-renewal of the approval of the active substance thiacloprid, in accordance with Regulation (EC) No 1107/2009 of the European Parliament and of the Council concerning the placing of plant protection products on the market, and amending the Annex to Commission Implementing Regulation (EU) No 540/2011</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/22</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-31">31 October 2019</docDate></p>\n <p>amending Annexes I and III to Regulation (EU) 2019/631 of the European Parliament and of the Council as regards the monitoring of CO<span class="SUB">2</span>emissions from new light commercial vehicles type-approved in a multi-stage process</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/25</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2008 laying down detailed rules for implementation of Council Regulation (EC) No 834/2007 as regards the arrangements for imports of organic products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/30</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>amending Implementing Regulation (EU) No 404/2011 as regards detailed rules for the direct electronic exchange of information enacted under the rules of the Common Fisheries Policy</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 74</span>— Uniform provisions concerning the approval of category L<span class="SUB">1</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/32]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'})]
%% Cell type:code id: tags:
``` python
trainset[0]['expected_xml']
```
%% Output
'<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'
%% Cell type:code id: tags:
``` python
from rouge_score import rouge_scorer
```
%% Cell type:code id: tags:
``` python
def validate_xml_rouge_score(example, pred, trace=None, threshold=0.0):
    """DSPy metric: ROUGE-score *pred* against example['expected_xml'].

    Parameters
    ----------
    example : mapping with an 'expected_xml' string used as the reference.
    pred : str — the generated XML to evaluate.
    trace : unused; present to satisfy the dspy metric call signature.
    threshold : float — minimum ROUGE-L F1 required to pass. Defaults to
        0.0 (accept everything) to preserve the original behaviour, in
        which `rougeL_f1 >= 0.0` was always True and the metric never
        filtered anything; pass a higher value to make it selective.

    Returns
    -------
    bool — True when the ROUGE-L F1 score reaches the threshold.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = scorer.score(example['expected_xml'], pred)
    # Extract the F1 component of each score tuple.
    rouge1_f1 = scores['rouge1'].fmeasure
    rougeL_f1 = scores['rougeL'].fmeasure
    print("ROUGE-1 F1:", rouge1_f1, "| ROUGE-L F1:", rougeL_f1)
    return rougeL_f1 >= threshold
```
%% Cell type:code id: tags:
``` python
from dspy.teleprompt import BootstrapFewShot
# Bootstrap few-shot demonstrations from the trainset using the
# ROUGE-based metric; the compiled pipeline embeds the selected demos.
teleprompter = BootstrapFewShot(metric=validate_xml_rouge_score)
compiled_pipeline = teleprompter.compile(DocumentToXMLPipeline(), trainset=trainset)
```
%% Output
100%|██████████| 2/2 [00:00<00:00, 34.16it/s]
ROUGE-1 F1: 0.430939226519337 | ROUGE-L F1: 0.34254143646408847
ROUGE-1 F1: 0.22372881355932203 | ROUGE-L F1: 0.21694915254237288
Bootstrapped 2 full traces after 2 examples in round 0.
%% Cell type:code id: tags:
``` python
!pip install dspy-ai
```
%% Output
Collecting dspy-ai
Downloading dspy_ai-2.4.12-py3-none-any.whl.metadata (38 kB)
Collecting backoff (from dspy-ai)
Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting datasets (from dspy-ai)
Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting joblib<=1.3.2 (from dspy-ai)
Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting openai<2.0.0,>=0.28.1 (from dspy-ai)
Downloading openai-1.35.14-py3-none-any.whl.metadata (21 kB)
Collecting optuna (from dspy-ai)
Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting pandas (from dspy-ai)
Downloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting pydantic~=2.0 (from dspy-ai)
Downloading pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 125.2/125.2 kB 8.6 MB/s eta 0:00:00
[?25hCollecting regex (from dspy-ai)
Downloading regex-2024.5.15-cp39-cp39-macosx_11_0_arm64.whl.metadata (40 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 3.6 MB/s eta 0:00:00
[?25hCollecting requests (from dspy-ai)
Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting structlog (from dspy-ai)
Downloading structlog-24.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting tqdm (from dspy-ai)
Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.6/57.6 kB 8.5 MB/s eta 0:00:00
[?25hCollecting ujson (from dspy-ai)
Downloading ujson-5.10.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.3 kB)
Collecting anyio<5,>=3.5.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading anyio-4.4.0-py3-none-any.whl.metadata (4.6 kB)
Collecting distro<2,>=1.7.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting sniffio (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Requirement already satisfied: typing-extensions<5,>=4.7 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from openai<2.0.0,>=0.28.1->dspy-ai) (4.12.2)
Collecting annotated-types>=0.4.0 (from pydantic~=2.0->dspy-ai)
Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.20.1 (from pydantic~=2.0->dspy-ai)
Downloading pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting filelock (from datasets->dspy-ai)
Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets->dspy-ai)
Downloading numpy-2.0.0-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.9/60.9 kB 6.1 MB/s eta 0:00:00
[?25hCollecting pyarrow>=15.0.0 (from datasets->dspy-ai)
Downloading pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting pyarrow-hotfix (from datasets->dspy-ai)
Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->dspy-ai)
Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->dspy-ai)
Downloading xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from datasets->dspy-ai)
Downloading multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets->dspy-ai)
Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets->dspy-ai)
Downloading aiohttp-3.9.5-cp39-cp39-macosx_11_0_arm64.whl.metadata (7.5 kB)
Collecting huggingface-hub>=0.21.2 (from datasets->dspy-ai)
Downloading huggingface_hub-0.24.0-py3-none-any.whl.metadata (13 kB)
Requirement already satisfied: packaging in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from datasets->dspy-ai) (24.1)
Collecting pyyaml>=5.1 (from datasets->dspy-ai)
Downloading PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting charset-normalizer<4,>=2 (from requests->dspy-ai)
Downloading charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (33 kB)
Collecting idna<4,>=2.5 (from requests->dspy-ai)
Downloading idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting urllib3<3,>=1.21.1 (from requests->dspy-ai)
Downloading urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)
Collecting certifi>=2017.4.17 (from requests->dspy-ai)
Downloading certifi-2024.7.4-py3-none-any.whl.metadata (2.2 kB)
Collecting alembic>=1.5.0 (from optuna->dspy-ai)
Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna->dspy-ai)
Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna->dspy-ai)
Downloading SQLAlchemy-2.0.31-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.6 kB)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from pandas->dspy-ai) (2.9.0.post0)
Collecting pytz>=2020.1 (from pandas->dspy-ai)
Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas->dspy-ai)
Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting Mako (from alembic>=1.5.0->optuna->dspy-ai)
Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Requirement already satisfied: exceptiongroup>=1.0.2 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=0.28.1->dspy-ai) (1.2.2)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets->dspy-ai)
Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting attrs>=17.3.0 (from aiohttp->datasets->dspy-ai)
Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->datasets->dspy-ai)
Downloading frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp->datasets->dspy-ai)
Downloading multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl.metadata (4.2 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->datasets->dspy-ai)
Downloading yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (31 kB)
Collecting async-timeout<5.0,>=4.0 (from aiohttp->datasets->dspy-ai)
Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai<2.0.0,>=0.28.1->dspy-ai)
Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai<2.0.0,>=0.28.1->dspy-ai)
Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Requirement already satisfied: six>=1.5 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas->dspy-ai) (1.16.0)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna->dspy-ai)
Downloading MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.0 kB)
Downloading dspy_ai-2.4.12-py3-none-any.whl (276 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 276.3/276.3 kB 8.1 MB/s eta 0:00:00
[?25hDownloading joblib-1.3.2-py3-none-any.whl (302 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.2/302.2 kB 903.5 kB/s eta 0:00:00a 0:00:01
[?25hDownloading openai-1.35.14-py3-none-any.whl (328 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 328.5/328.5 kB 291.2 kB/s eta 0:00:00a 0:00:01
[?25hDownloading pydantic-2.8.2-py3-none-any.whl (423 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 423.9/423.9 kB 2.5 MB/s eta 0:00:00a 0:00:01
[?25hDownloading pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl (1.7 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading tqdm-4.66.4-py3-none-any.whl (78 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.3/78.3 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading backoff-2.2.1-py3-none-any.whl (15 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 547.8/547.8 kB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading requests-2.32.3-py3-none-any.whl (64 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.9/64.9 kB 2.2 MB/s eta 0:00:00
[?25hDownloading optuna-3.6.1-py3-none-any.whl (380 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 380.1/380.1 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.3/11.3 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading regex-2024.5.15-cp39-cp39-macosx_11_0_arm64.whl (278 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 278.3/278.3 kB 1.3 MB/s eta 0:00:0000:0100:01
[?25hDownloading structlog-24.4.0-py3-none-any.whl (67 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 67.2/67.2 kB 233.3 kB/s eta 0:00:000:00:01
[?25hDownloading ujson-5.10.0-cp39-cp39-macosx_11_0_arm64.whl (51 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 51.8/51.8 kB 2.2 MB/s eta 0:00:00
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.0/233.0 kB 3.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading annotated_types-0.7.0-py3-none-any.whl (13 kB)
Downloading anyio-4.4.0-py3-none-any.whl (86 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 2.3 MB/s eta 0:00:00
[?25hDownloading certifi-2024.7.4-py3-none-any.whl (162 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 163.0/163.0 kB 2.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl (120 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 120.4/120.4 kB 101.0 kB/s eta 0:00:00 0:00:02
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 2.9 MB/s eta 0:00:00a 0:00:01
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading fsspec-2024.5.0-py3-none-any.whl (316 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.1/316.1 kB 4.9 MB/s eta 0:00:00a 0:00:01
[?25hDownloading aiohttp-3.9.5-cp39-cp39-macosx_11_0_arm64.whl (390 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 390.7/390.7 kB 1.2 MB/s eta 0:00:00a 0:00:01m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 75.6/75.6 kB 2.4 MB/s eta 0:00:00
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 77.9/77.9 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading huggingface_hub-0.24.0-py3-none-any.whl (419 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 419.0/419.0 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading idna-3.7-py3-none-any.whl (66 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.8/66.8 kB 1.9 MB/s eta 0:00:00
[?25hDownloading numpy-2.0.0-cp39-cp39-macosx_14_0_arm64.whl (5.2 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.2/5.2 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl (27.2 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.2/27.2 MB 1.1 MB/s eta 0:00:0000:0100:01m
[?25hDownloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 505.5/505.5 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl (174 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 174.4/174.4 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading sniffio-1.3.1-py3-none-any.whl (10 kB)
Downloading SQLAlchemy-2.0.31-cp39-cp39-macosx_11_0_arm64.whl (2.1 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 629.2 kB/s eta 0:00:0000:0100:01
[?25hDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 345.4/345.4 kB 720.8 kB/s eta 0:00:000:0100:01
[?25hDownloading urllib3-2.2.2-py3-none-any.whl (121 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.4/121.4 kB 2.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading filelock-3.15.4-py3-none-any.whl (16 kB)
Downloading multiprocess-0.70.16-py39-none-any.whl (133 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 133.4/133.4 kB 4.3 MB/s eta 0:00:00
[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Downloading xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl (30 kB)
Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Downloading attrs-23.2.0-py3-none-any.whl (60 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.8/60.8 kB 3.9 MB/s eta 0:00:00
[?25hDownloading frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl (53 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.7/53.7 kB 4.9 MB/s eta 0:00:00
[?25hDownloading multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl (30 kB)
Downloading yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl (81 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 81.8/81.8 kB 5.0 MB/s eta 0:00:00
[?25hDownloading Mako-1.3.5-py3-none-any.whl (78 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.6/78.6 kB 4.8 MB/s eta 0:00:00
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.3/58.3 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl (18 kB)
Installing collected packages: pytz, xxhash, urllib3, ujson, tzdata, tqdm, structlog, sqlalchemy, sniffio, regex, pyyaml, pydantic-core, pyarrow-hotfix, numpy, multidict, MarkupSafe, joblib, idna, h11, fsspec, frozenlist, filelock, distro, dill, colorlog, charset-normalizer, certifi, backoff, attrs, async-timeout, annotated-types, yarl, requests, pydantic, pyarrow, pandas, multiprocess, Mako, httpcore, anyio, aiosignal, huggingface-hub, httpx, alembic, aiohttp, optuna, openai, datasets, dspy-ai
Successfully installed Mako-1.3.5 MarkupSafe-2.1.5 aiohttp-3.9.5 aiosignal-1.3.1 alembic-1.13.2 annotated-types-0.7.0 anyio-4.4.0 async-timeout-4.0.3 attrs-23.2.0 backoff-2.2.1 certifi-2024.7.4 charset-normalizer-3.3.2 colorlog-6.8.2 datasets-2.20.0 dill-0.3.8 distro-1.9.0 dspy-ai-2.4.12 filelock-3.15.4 frozenlist-1.4.1 fsspec-2024.5.0 h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 huggingface-hub-0.24.0 idna-3.7 joblib-1.3.2 multidict-6.0.5 multiprocess-0.70.16 numpy-2.0.0 openai-1.35.14 optuna-3.6.1 pandas-2.2.2 pyarrow-17.0.0 pyarrow-hotfix-0.6 pydantic-2.8.2 pydantic-core-2.20.1 pytz-2024.1 pyyaml-6.0.1 regex-2024.5.15 requests-2.32.3 sniffio-1.3.1 sqlalchemy-2.0.31 structlog-24.4.0 tqdm-4.66.4 tzdata-2024.1 ujson-5.10.0 urllib3-2.2.2 xxhash-3.4.1 yarl-1.9.4
%% Cell type:code id: tags:
``` python
import dspy
# NOTE(review): unlike the earlier setup cell, no api_key is passed here —
# presumably dspy.OpenAI falls back to the OPENAI_API_KEY env var; confirm.
turbo = dspy.OpenAI(model='gpt-3.5-turbo')
dspy.settings.configure(lm=turbo)
```
%% Output
/Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020
warnings.warn(
/Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
%% Cell type:code id: tags:
``` python
class GenerateXML(dspy.Signature):
    """Create an XML representation of a document cover page in the Akoma Ntoso (AKN) format."""

    # Despite the name 'section', this field carries the raw cover-page text.
    section = dspy.InputField(desc="Raw text format of the document cover page")
    xml = dspy.OutputField(desc="AKN XML representation of the input cover page")
```
%% Cell type:markdown id: tags:
Download data
%% Cell type:code id: tags:
``` python
import requests
import zipfile
import os
```
%% Cell type:code id: tags:
``` python
def download_and_extract_zip(url, extract_to):
    """
    Download a ZIP file from *url* and extract it into *extract_to*.

    The target directory is created if needed. If it already contains any
    entries, the download is skipped so repeated runs are idempotent.

    :param url: HTTP(S) URL of the ZIP archive.
    :param extract_to: destination directory for the extracted files.
    """
    # Create the output directory first: os.listdir() raises FileNotFoundError
    # on a missing path, so the original check-before-makedirs ordering
    # crashed on the very first run.
    os.makedirs(extract_to, exist_ok=True)

    # Skip the download when the directory already holds data.
    if os.listdir(extract_to):
        print(f"Data already exists in {extract_to}. Skipping download.")
        return

    # Fetch the archive and fail loudly on HTTP errors.
    response = requests.get(url)
    response.raise_for_status()  # Check that the request was successful

    # Persist the archive to disk before extraction.
    zip_path = os.path.join(extract_to, 'downloaded_files.zip')
    with open(zip_path, 'wb') as f:
        f.write(response.content)

    # Unpack everything next to the archive.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    # Remove the ZIP file after extraction.
    os.remove(zip_path)
    print(f"Files extracted to {extract_to}")
```
%% Cell type:code id: tags:
``` python
# URL of the file to be downloaded
s3_url = "https://ai4xml-data.s3.eu-west-1.amazonaws.com/planJO/selection_for_gen4ai/gen4ai_related_files.zip"
# Directory to store the extracted files
output_dir = 'data/genai4lex_word_docs/'
# Fetch and unpack the archive (skipped if output_dir already has data).
download_and_extract_zip(s3_url, output_dir)
```
%% Cell type:code id: tags:
``` python
from functions import *
```
%% Cell type:code id: tags:
``` python
# Specify the folder containing XML files and the output JSON file name
xml_folder = 'data/genai4lex_word_docs_xml'
output_json = 'prefaces.json'
# Serialize the <preface> element of each XML file into prefaces.json.
extract_preface_content(xml_folder, output_json)
```
%% Cell type:markdown id: tags:
XML comparison for test
TODO: replace ROUGE with an XML-aware comparison metric?
%% Cell type:code id: tags:
``` python
!pip install xmldiff
```
%% Output
Requirement already satisfied: xmldiff in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (2.7.0)
Requirement already satisfied: setuptools in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from xmldiff) (58.0.4)
Requirement already satisfied: lxml>=3.1.0 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from xmldiff) (5.2.2)
%% Cell type:code id: tags:
``` python
from xmldiff import main, formatting
def compare_xml_content(xml1, xml2):
    """Compare two XML documents and return their annotated differences."""
    xml_formatter = formatting.XMLFormatter()
    return main.diff_texts(xml1, xml2, formatter=xml_formatter)
# Example XML documents
xml1 = """<root>
<child1 attribute="value1">Text1</child1>
<child2>Text2</child2>
</root>"""
xml2 = """<root>
<child1 attribute="value1">Text1</child1>
<child2>Text3</child2> <!-- Changed text -->
<child3>New child</child3> <!-- New element -->
</root>"""
# Get the difference
difference = compare_xml_content(xml1, xml2)
print("Differences:", difference)
```
%% Output
Differences: <root xmlns:diff="http://namespaces.shoobx.com/diff">
<child1 attribute="value1">Text1</child1>
<child2>Text<diff:delete>2</diff:delete><diff:insert>3</diff:insert></child2><diff:delete>
</diff:delete><diff:insert> </diff:insert><child3 diff:insert="">New child</child3><diff:insert> </diff:insert></root>
%% Cell type:code id: tags:
``` python
import xml.etree.ElementTree as ET
import logging
class XmlTree():
    """Structural comparison of two XML trees parsed with xml.etree.ElementTree.

    Mismatches are logged at DEBUG level to 'xml-comparison.log'.
    """

    def __init__(self):
        # Keep hdlr/formatter as attributes (original public surface) and
        # wire them into an actual logger: the original never created
        # self.logger, so every .debug() call raised AttributeError.
        self.hdlr = logging.FileHandler('xml-comparison.log')
        self.formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        self.hdlr.setFormatter(self.formatter)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.addHandler(self.hdlr)
        self.logger.setLevel(logging.DEBUG)

    @staticmethod
    def convert_string_to_tree(xmlString):
        """Parse an XML string and return the root Element."""
        return ET.fromstring(xmlString)

    def xml_compare(self, x1, x2, excludes=None):
        """
        Compares two xml etrees.

        :param x1: the first tree
        :param x2: the second tree
        :param excludes: list of attribute names / child tags to exclude
                         from comparison (default: none)
        :return: True if both trees match
        """
        # Avoid the mutable-default-argument pitfall of the original
        # signature (excludes=[]); None behaves identically for callers.
        excludes = [] if excludes is None else excludes
        if x1.tag != x2.tag:
            self.logger.debug('Tags do not match: %s and %s' % (x1.tag, x2.tag))
            return False
        for name, value in x1.attrib.items():
            if name not in excludes:
                if x2.attrib.get(name) != value:
                    self.logger.debug('Attributes do not match: %s=%r, %s=%r'
                                      % (name, value, name, x2.attrib.get(name)))
                    return False
        for name in x2.attrib.keys():
            if name not in excludes:
                if name not in x1.attrib:
                    self.logger.debug('x2 has an attribute x1 is missing: %s'
                                      % name)
                    return False
        if not self.text_compare(x1.text, x2.text):
            self.logger.debug('text: %r != %r' % (x1.text, x2.text))
            return False
        if not self.text_compare(x1.tail, x2.tail):
            self.logger.debug('tail: %r != %r' % (x1.tail, x2.tail))
            return False
        # Element.getchildren() was removed in Python 3.9 (the notebook
        # output shows the resulting AttributeError); list(elem) is the
        # supported way to materialize an element's children.
        cl1 = list(x1)
        cl2 = list(x2)
        if len(cl1) != len(cl2):
            self.logger.debug('children length differs, %i != %i'
                              % (len(cl1), len(cl2)))
            return False
        for i, (c1, c2) in enumerate(zip(cl1, cl2), start=1):
            if c1.tag not in excludes:
                if not self.xml_compare(c1, c2, excludes):
                    self.logger.debug('children %i do not match: %s'
                                      % (i, c1.tag))
                    return False
        return True

    def text_compare(self, t1, t2):
        """
        Compare two text strings; '*' acts as a wildcard and None/empty
        strings compare as equal.

        :param t1: text one
        :param t2: text two
        :return: True if a match
        """
        if not t1 and not t2:
            return True
        if t1 == '*' or t2 == '*':
            return True
        return (t1 or '').strip() == (t2 or '').strip()
```
%% Cell type:code id: tags:
``` python
# Two documents that differ only in the <from> element's text; excluding
# the "from" tag from comparison should therefore report a match.
xml1 = "<note><to>Tove</to><from>Jani</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"
xml2 = "<note><to>Tove</to><from>Daniel</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"
tree1 = XmlTree.convert_string_to_tree(xml1)
tree2 = XmlTree.convert_string_to_tree(xml2)
comparator = XmlTree()
if comparator.xml_compare(tree1, tree2, ["from"]):
    print ("XMLs match")
else:
    print ("XMLs don't match")
```
%% Output
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[6], line 10
6 tree2 = XmlTree.convert_string_to_tree(xml2)
8 comparator = XmlTree()
---> 10 if comparator.xml_compare(tree1, tree2, ["from"]):
11 print ("XMLs match")
12 else:
Cell In[4], line 46, in XmlTree.xml_compare(self, x1, x2, excludes)
44 self.logger.debug('tail: %r != %r' % (x1.tail, x2.tail))
45 return False
---> 46 cl1 = x1.getchildren()
47 cl2 = x2.getchildren()
48 if len(cl1) != len(cl2):
AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'getchildren'
%% Cell type:markdown id: tags:
# extract text using pandoc
%% Cell type:code id: tags:
``` python
# Set the path to your documents and the output JSON file name
root_folder = '/Users/nasredine/dev/work/playground/dspy_programs/data/genai4lex_word_docs_xml'
output_json = 'md_text_output.json'
# Convert every .docx found under root_folder to Markdown (via pandoc)
# and store the results keyed by CELEX id.
process_documents(root_folder, output_json)
```
%% Cell type:code id: tags:
``` python
# Example usage — replace the placeholder literals with real XML/Markdown
# content before running; extract_and_find reports where the XML text
# appears inside the Markdown.
xml_data = """<your xml string here>"""
md_data = """<your markdown string here>"""
result = extract_and_find(xml_data, md_data)
print(result)
```
%% Cell type:code id: tags:
``` python
!pip install dspy-ai
```
%% Output
Collecting dspy-ai
Downloading dspy_ai-2.4.12-py3-none-any.whl.metadata (38 kB)
Collecting backoff (from dspy-ai)
Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting datasets (from dspy-ai)
Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting joblib<=1.3.2 (from dspy-ai)
Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting openai<2.0.0,>=0.28.1 (from dspy-ai)
Downloading openai-1.35.14-py3-none-any.whl.metadata (21 kB)
Collecting optuna (from dspy-ai)
Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting pandas (from dspy-ai)
Downloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting pydantic~=2.0 (from dspy-ai)
Downloading pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 125.2/125.2 kB 8.6 MB/s eta 0:00:00
[?25hCollecting regex (from dspy-ai)
Downloading regex-2024.5.15-cp39-cp39-macosx_11_0_arm64.whl.metadata (40 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 3.6 MB/s eta 0:00:00
[?25hCollecting requests (from dspy-ai)
Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting structlog (from dspy-ai)
Downloading structlog-24.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting tqdm (from dspy-ai)
Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.6/57.6 kB 8.5 MB/s eta 0:00:00
[?25hCollecting ujson (from dspy-ai)
Downloading ujson-5.10.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.3 kB)
Collecting anyio<5,>=3.5.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading anyio-4.4.0-py3-none-any.whl.metadata (4.6 kB)
Collecting distro<2,>=1.7.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting sniffio (from openai<2.0.0,>=0.28.1->dspy-ai)
Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Requirement already satisfied: typing-extensions<5,>=4.7 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from openai<2.0.0,>=0.28.1->dspy-ai) (4.12.2)
Collecting annotated-types>=0.4.0 (from pydantic~=2.0->dspy-ai)
Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.20.1 (from pydantic~=2.0->dspy-ai)
Downloading pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting filelock (from datasets->dspy-ai)
Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets->dspy-ai)
Downloading numpy-2.0.0-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.9/60.9 kB 6.1 MB/s eta 0:00:00
[?25hCollecting pyarrow>=15.0.0 (from datasets->dspy-ai)
Downloading pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting pyarrow-hotfix (from datasets->dspy-ai)
Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->dspy-ai)
Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->dspy-ai)
Downloading xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from datasets->dspy-ai)
Downloading multiprocess-0.70.16-py39-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets->dspy-ai)
Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets->dspy-ai)
Downloading aiohttp-3.9.5-cp39-cp39-macosx_11_0_arm64.whl.metadata (7.5 kB)
Collecting huggingface-hub>=0.21.2 (from datasets->dspy-ai)
Downloading huggingface_hub-0.24.0-py3-none-any.whl.metadata (13 kB)
Requirement already satisfied: packaging in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from datasets->dspy-ai) (24.1)
Collecting pyyaml>=5.1 (from datasets->dspy-ai)
Downloading PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting charset-normalizer<4,>=2 (from requests->dspy-ai)
Downloading charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (33 kB)
Collecting idna<4,>=2.5 (from requests->dspy-ai)
Downloading idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting urllib3<3,>=1.21.1 (from requests->dspy-ai)
Downloading urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)
Collecting certifi>=2017.4.17 (from requests->dspy-ai)
Downloading certifi-2024.7.4-py3-none-any.whl.metadata (2.2 kB)
Collecting alembic>=1.5.0 (from optuna->dspy-ai)
Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna->dspy-ai)
Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna->dspy-ai)
Downloading SQLAlchemy-2.0.31-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.6 kB)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from pandas->dspy-ai) (2.9.0.post0)
Collecting pytz>=2020.1 (from pandas->dspy-ai)
Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas->dspy-ai)
Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting Mako (from alembic>=1.5.0->optuna->dspy-ai)
Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Requirement already satisfied: exceptiongroup>=1.0.2 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from anyio<5,>=3.5.0->openai<2.0.0,>=0.28.1->dspy-ai) (1.2.2)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets->dspy-ai)
Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting attrs>=17.3.0 (from aiohttp->datasets->dspy-ai)
Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting frozenlist>=1.1.1 (from aiohttp->datasets->dspy-ai)
Downloading frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp->datasets->dspy-ai)
Downloading multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl.metadata (4.2 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->datasets->dspy-ai)
Downloading yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (31 kB)
Collecting async-timeout<5.0,>=4.0 (from aiohttp->datasets->dspy-ai)
Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai<2.0.0,>=0.28.1->dspy-ai)
Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai<2.0.0,>=0.28.1->dspy-ai)
Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Requirement already satisfied: six>=1.5 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas->dspy-ai) (1.16.0)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna->dspy-ai)
Downloading MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.0 kB)
Downloading dspy_ai-2.4.12-py3-none-any.whl (276 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 276.3/276.3 kB 8.1 MB/s eta 0:00:00
[?25hDownloading joblib-1.3.2-py3-none-any.whl (302 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.2/302.2 kB 903.5 kB/s eta 0:00:00a 0:00:01
[?25hDownloading openai-1.35.14-py3-none-any.whl (328 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 328.5/328.5 kB 291.2 kB/s eta 0:00:00a 0:00:01
[?25hDownloading pydantic-2.8.2-py3-none-any.whl (423 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 423.9/423.9 kB 2.5 MB/s eta 0:00:00a 0:00:01
[?25hDownloading pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl (1.7 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading tqdm-4.66.4-py3-none-any.whl (78 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.3/78.3 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading backoff-2.2.1-py3-none-any.whl (15 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 547.8/547.8 kB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading requests-2.32.3-py3-none-any.whl (64 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.9/64.9 kB 2.2 MB/s eta 0:00:00
[?25hDownloading optuna-3.6.1-py3-none-any.whl (380 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 380.1/380.1 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.3/11.3 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading regex-2024.5.15-cp39-cp39-macosx_11_0_arm64.whl (278 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 278.3/278.3 kB 1.3 MB/s eta 0:00:0000:0100:01
[?25hDownloading structlog-24.4.0-py3-none-any.whl (67 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 67.2/67.2 kB 233.3 kB/s eta 0:00:000:00:01
[?25hDownloading ujson-5.10.0-cp39-cp39-macosx_11_0_arm64.whl (51 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 51.8/51.8 kB 2.2 MB/s eta 0:00:00
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.0/233.0 kB 3.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading annotated_types-0.7.0-py3-none-any.whl (13 kB)
Downloading anyio-4.4.0-py3-none-any.whl (86 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 2.3 MB/s eta 0:00:00
[?25hDownloading certifi-2024.7.4-py3-none-any.whl (162 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 163.0/163.0 kB 2.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl (120 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 120.4/120.4 kB 101.0 kB/s eta 0:00:00 0:00:02
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 2.9 MB/s eta 0:00:00a 0:00:01
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading fsspec-2024.5.0-py3-none-any.whl (316 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.1/316.1 kB 4.9 MB/s eta 0:00:00a 0:00:01
[?25hDownloading aiohttp-3.9.5-cp39-cp39-macosx_11_0_arm64.whl (390 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 390.7/390.7 kB 1.2 MB/s eta 0:00:00a 0:00:01m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 75.6/75.6 kB 2.4 MB/s eta 0:00:00
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 77.9/77.9 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading huggingface_hub-0.24.0-py3-none-any.whl (419 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 419.0/419.0 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading idna-3.7-py3-none-any.whl (66 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.8/66.8 kB 1.9 MB/s eta 0:00:00
[?25hDownloading numpy-2.0.0-cp39-cp39-macosx_14_0_arm64.whl (5.2 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.2/5.2 MB 1.2 MB/s eta 0:00:0000:0100:01
[?25hDownloading pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl (27.2 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.2/27.2 MB 1.1 MB/s eta 0:00:0000:0100:01m
[?25hDownloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 505.5/505.5 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl (174 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 174.4/174.4 kB 1.3 MB/s eta 0:00:00a 0:00:01
[?25hDownloading sniffio-1.3.1-py3-none-any.whl (10 kB)
Downloading SQLAlchemy-2.0.31-cp39-cp39-macosx_11_0_arm64.whl (2.1 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 629.2 kB/s eta 0:00:0000:0100:01
[?25hDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 345.4/345.4 kB 720.8 kB/s eta 0:00:000:0100:01
[?25hDownloading urllib3-2.2.2-py3-none-any.whl (121 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.4/121.4 kB 2.8 MB/s eta 0:00:00a 0:00:01
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading filelock-3.15.4-py3-none-any.whl (16 kB)
Downloading multiprocess-0.70.16-py39-none-any.whl (133 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 133.4/133.4 kB 4.3 MB/s eta 0:00:00
[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Downloading xxhash-3.4.1-cp39-cp39-macosx_11_0_arm64.whl (30 kB)
Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Downloading attrs-23.2.0-py3-none-any.whl (60 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.8/60.8 kB 3.9 MB/s eta 0:00:00
[?25hDownloading frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl (53 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.7/53.7 kB 4.9 MB/s eta 0:00:00
[?25hDownloading multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl (30 kB)
Downloading yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl (81 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 81.8/81.8 kB 5.0 MB/s eta 0:00:00
[?25hDownloading Mako-1.3.5-py3-none-any.whl (78 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.6/78.6 kB 4.8 MB/s eta 0:00:00
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.3/58.3 kB 1.2 MB/s eta 0:00:00a 0:00:01
[?25hDownloading MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl (18 kB)
Installing collected packages: pytz, xxhash, urllib3, ujson, tzdata, tqdm, structlog, sqlalchemy, sniffio, regex, pyyaml, pydantic-core, pyarrow-hotfix, numpy, multidict, MarkupSafe, joblib, idna, h11, fsspec, frozenlist, filelock, distro, dill, colorlog, charset-normalizer, certifi, backoff, attrs, async-timeout, annotated-types, yarl, requests, pydantic, pyarrow, pandas, multiprocess, Mako, httpcore, anyio, aiosignal, huggingface-hub, httpx, alembic, aiohttp, optuna, openai, datasets, dspy-ai
Successfully installed Mako-1.3.5 MarkupSafe-2.1.5 aiohttp-3.9.5 aiosignal-1.3.1 alembic-1.13.2 annotated-types-0.7.0 anyio-4.4.0 async-timeout-4.0.3 attrs-23.2.0 backoff-2.2.1 certifi-2024.7.4 charset-normalizer-3.3.2 colorlog-6.8.2 datasets-2.20.0 dill-0.3.8 distro-1.9.0 dspy-ai-2.4.12 filelock-3.15.4 frozenlist-1.4.1 fsspec-2024.5.0 h11-0.14.0 httpcore-1.0.5 httpx-0.27.0 huggingface-hub-0.24.0 idna-3.7 joblib-1.3.2 multidict-6.0.5 multiprocess-0.70.16 numpy-2.0.0 openai-1.35.14 optuna-3.6.1 pandas-2.2.2 pyarrow-17.0.0 pyarrow-hotfix-0.6 pydantic-2.8.2 pydantic-core-2.20.1 pytz-2024.1 pyyaml-6.0.1 regex-2024.5.15 requests-2.32.3 sniffio-1.3.1 sqlalchemy-2.0.31 structlog-24.4.0 tqdm-4.66.4 tzdata-2024.1 ujson-5.10.0 urllib3-2.2.2 xxhash-3.4.1 yarl-1.9.4
%% Cell type:markdown id: tags:
## Step 1: Setup
%% Cell type:code id: tags:
``` python
import dspy
# Configure DSPy's global settings to use OpenAI gpt-3.5-turbo as the LM.
turbo = dspy.OpenAI(model='gpt-3.5-turbo')
dspy.settings.configure(lm=turbo)
```
%% Output
/Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020
warnings.warn(
/Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
%% Cell type:markdown id: tags:
## Step 2: Define Signatures
%% Cell type:code id: tags:
``` python
class GenerateAKN(dspy.Signature):
    """Transform word document into Akoma ntoso (AKN) XML format."""
    # The desc strings below are injected into the LM prompt by DSPy.
    section = dspy.InputField(desc="Full document that contains title cover-page, body text, articles, etc")
    xml = dspy.OutputField(desc="XML representation of the input section in AKN format")
```
%% Cell type:markdown id: tags:
## Step 3: Building the Transformation Pipeline
%% Cell type:code id: tags:
``` python
class DocumentToXMLPipeline(dspy.Module):
    """Generate AKN XML for each section of a document and wrap the parts in <root>."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought generation driven by the GenerateAKN signature.
        self.transform = dspy.ChainOfThought(GenerateAKN)

    def forward(self, document):
        """Transform every section of *document* and concatenate the results.

        Accepts either an object exposing a ``sections`` attribute (each
        section exposing ``.text``) or a plain dict of the shape
        ``{"sections": [{"text": ...}, ...]}``. The original only handled
        attribute access, so the dict passed by the example caller raised
        AttributeError.
        """
        sections = document["sections"] if isinstance(document, dict) else document.sections
        xml_parts = []
        for section in sections:
            section_text = section["text"] if isinstance(section, dict) else section.text
            xml_part = self.transform(section=section_text)
            xml_parts.append(xml_part.xml)
        # Wrap all generated fragments in a single root element.
        full_xml = "<root>" + "".join(xml_parts) + "</root>"
        return full_xml
```
%% Cell type:markdown id: tags:
## Step 4: Executing the Pipeline
%% Cell type:code id: tags:
``` python
# NOTE(review): DocumentToXMLPipeline.forward accesses document.sections /
# section.text, so passing this plain dict raises AttributeError — confirm
# whether a wrapper object (like the commented-out Document/Section classes
# elsewhere in this file) was intended.
document = {
    "sections": [
        {"title": "Cover Page", "text": "Document Title"},
        {"title": "Introduction", "text": "Here is the introduction..."},
        {"title": "Conclusion", "text": "Here is the conclusion..."}
    ]
}
xml_pipeline = DocumentToXMLPipeline()
full_xml_document = xml_pipeline(document)
print(full_xml_document)
```
import re
import os
import json
import subprocess
import xml.etree.ElementTree as ET
import requests
import zipfile
def normalize_text(text):
    """Collapse all whitespace runs to single spaces, lowercase, and trim."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip().lower()
def extract_and_find(xml_string, md_string, length=30):
    """Locate the normalized XML text inside the normalized Markdown text.

    Uses the first and last *length* characters of the normalized XML as
    anchors and reports the span they delimit in the Markdown.
    """
    # Normalize both inputs identically so whitespace/case differences
    # do not prevent matching.
    normalized_xml = normalize_text(xml_string)
    normalized_md = normalize_text(md_string)

    head = normalized_xml[:length]
    tail = normalized_xml[-length:]

    start_index = normalized_md.find(head)
    end_index = normalized_md.rfind(tail)

    span_found = start_index != -1 and end_index != -1 and start_index < end_index
    if span_found:
        return f"Text likely spans from index {start_index} to {end_index + length} in the Markdown file."
    return "Matching text not found in Markdown."
def remove_namespaces(xml_element):
    """Recursively strip namespace prefixes from tags and attributes, in place."""
    for node in xml_element.iter():
        # Namespaced tags parse as '{uri}localname'; keep only the local part.
        if '}' in node.tag:
            node.tag = node.tag.split('}', 1)[1]
        # Re-key namespaced attributes under their local names.
        for attr_name in list(node.attrib):
            if '}' in attr_name:
                local_name = attr_name.split('}', 1)[1]
                node.attrib[local_name] = node.attrib.pop(attr_name)
# NOTE(review): this definition is byte-for-byte identical to the
# remove_namespaces defined immediately above; Python keeps this second
# binding, so one of the two should be deleted.
def remove_namespaces(xml_element):
    """ Recursively remove namespace prefixes from an XML element and its children. """
    for elem in xml_element.iter():
        if '}' in elem.tag:
            elem.tag = elem.tag.split('}', 1)[1]  # Removes namespace
        # Update attributes to remove namespaces
        attributes = list(elem.attrib.keys())
        for attr in attributes:
            if '}' in attr:
                new_attr = attr.split('}', 1)[1]
                elem.attrib[new_attr] = elem.attrib.pop(attr)
def extract_preface_content(xml_folder, output_json):
    """Collect the <preface> element of every XML file under *xml_folder*.

    Walks the folder recursively; for each ``*.xml`` file the namespace-free
    <preface> subtree is serialized. Results are keyed by the file name
    without extension (used as the CELEX id) and written to *output_json*.

    :param xml_folder: root directory to scan for XML files.
    :param output_json: path of the JSON file to write.
    """
    results = {}
    # Iterate over every XML file in the folder and its subfolders
    for root_dir, sub_dirs, files in os.walk(xml_folder):
        for filename in files:
            if filename.endswith('.xml'):
                # The file name without extension serves as the document id.
                filename_no_ext = os.path.splitext(filename)[0]
                file_path = os.path.join(root_dir, filename)

                tree = ET.parse(file_path)
                root = tree.getroot()

                # Strip namespaces so the search below can use plain tag names.
                remove_namespaces(root)

                # Extract the entire <preface> element, if present.
                preface = root.find('.//preface')
                if preface is not None:
                    # Serialize the <preface> element including its content.
                    preface_xml = ET.tostring(preface, encoding='unicode')
                else:
                    preface_xml = "No preface found"
                results[filename_no_ext] = {
                    'celex_id': filename_no_ext,
                    'expected_xml': preface_xml,
                    'text': ""
                }

    # Write results exactly once: the original called json.dump twice on the
    # same handle, producing a doubled — and therefore invalid — JSON file.
    with open(output_json, 'w') as json_file:
        json.dump(results, json_file, indent=4)
def convert_docx_to_md(docx_path):
    """Convert a DOCX file to Markdown using Pandoc.

    :param docx_path: path to the ``.docx`` file.
    :return: the Markdown text, or None if the conversion failed.
    """
    try:
        # check=True makes a non-zero pandoc exit raise CalledProcessError;
        # without it the except branch below was unreachable and failed
        # conversions silently returned pandoc's (possibly empty) stdout.
        result = subprocess.run(
            ['pandoc', '-f', 'docx', '-t', 'markdown', docx_path],
            capture_output=True, text=True, check=True,
        )
        return result.stdout
    except subprocess.CalledProcessError as e:
        print("An error occurred while converting DOCX to Markdown:", e)
        return None
def process_documents(root_folder, output_json):
    """Convert every .docx under *root_folder* to Markdown and save as JSON."""
    results = {}
    # Walk the directory tree; the parent folder name is assumed to be the
    # CELEX id of the document it contains.
    for current_dir, _dirs, filenames in os.walk(root_folder):
        for name in filenames:
            if not name.endswith('.docx'):
                continue
            celex_id = os.path.basename(current_dir)
            markdown_text = convert_docx_to_md(os.path.join(current_dir, name))
            if markdown_text:
                results[celex_id] = markdown_text
    # Persist the collected Markdown keyed by CELEX id.
    with open(output_json, 'w') as json_file:
        json.dump(results, json_file, indent=4)
def download_and_extract_zip(script_dir, zip_url):
    """Download Documents.zip from *zip_url* into *script_dir* and extract it.

    NOTE(review): unlike the notebook variant of this helper, this version
    neither creates script_dir nor skips when data already exists — callers
    are expected to check for the 'Documents' folder first.
    """
    print("The 'Documents' folder does not exist. Downloading and extracting the zip file...")
    # Download the zip file
    zip_file = os.path.join(script_dir, "Documents.zip")
    response = requests.get(zip_url)
    with open(zip_file, "wb") as f:
        f.write(response.content)
    # Extract the zip file
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(script_dir)
    # Remove the downloaded zip file
    os.remove(zip_file)
\ No newline at end of file
%% Cell type:code id: tags:
``` python
!pip install dspy-ai
!pip install python-dotenv
!pip install rouge-score
```
%% Output
13008.28s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
Requirement already satisfied: rouge-score in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (0.1.2)
Requirement already satisfied: absl-py in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from rouge-score) (2.1.0)
Requirement already satisfied: nltk in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from rouge-score) (3.8.1)
Requirement already satisfied: numpy in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from rouge-score) (2.0.0)
Requirement already satisfied: six>=1.14.0 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from rouge-score) (1.16.0)
Requirement already satisfied: click in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from nltk->rouge-score) (8.1.7)
Requirement already satisfied: joblib in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from nltk->rouge-score) (1.3.2)
Requirement already satisfied: regex>=2021.8.3 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from nltk->rouge-score) (2024.5.15)
Requirement already satisfied: tqdm in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from nltk->rouge-score) (4.66.4)
%% Cell type:code id: tags:
``` python
from dotenv import load_dotenv
import os
import json
import re
```
%% Cell type:code id: tags:
``` python
# Load environment variables from a local .env file and read the OpenAI key.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
```
%% Cell type:markdown id: tags:
## Step 1: Setup
%% Cell type:code id: tags:
``` python
import dspy
# Use gpt-4o-mini with a large completion budget, since full AKN prefaces
# can be long.
turbo = dspy.OpenAI(api_key=api_key, model='gpt-4o-mini', max_tokens=10000)
dspy.settings.configure(lm=turbo)
```
%% Cell type:markdown id: tags:
## Step 2: Define Signatures
%% Cell type:code id: tags:
``` python
# class Document:
# """A simple document class to simulate the expected structure."""
# def __init__(self, text):
# self.sections = [Section(text)]
# class Section:
# """A section of the document."""
# def __init__(self, text):
# self.text = text
class GenerateAKN(dspy.Signature):
    """Create an XML representation of a document preface section in the Akoma Ntoso (AKN) format."""
    # Field desc strings are injected into the LM prompt; the "prefece"
    # typo is fixed so the model sees the intended word "preface".
    text = dspy.InputField(desc="Raw text format of the document preface section")
    xml = dspy.OutputField(desc="Akoma Ntoso (AKN) XML representation of the input preface")
```
%% Cell type:markdown id: tags:
## Step 3: Building the Transformation Pipeline
%% Cell type:code id: tags:
``` python
class DocumentToXMLPipeline(dspy.Module):
    """Transform raw preface text into an Akoma Ntoso <preface> XML fragment."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought generation driven by the GenerateAKN signature.
        self.transform = dspy.ChainOfThought(GenerateAKN)

    def extract_xml(self, content):
        """Return the first <preface>...</preface> fragment found in *content*.

        The DOTALL flag lets the pattern span newlines; an empty string is
        returned when no fragment is present.
        """
        found = re.search(r'<preface>.*?</preface>', content, re.DOTALL)
        return found.group(0) if found else ""

    def forward(self, text):
        """Generate AKN XML for *text* and keep only the <preface> part."""
        # Empty input yields an empty XML structure.
        if not text:
            return ""
        generated = self.transform(text=text)
        return self.extract_xml(generated.xml)
```
%% Cell type:markdown id: tags:
## Step 4: Executing the Pipeline (0-shot conversion without optimization)
%% Cell type:code id: tags:
``` python
# Instantiate the preface-to-AKN pipeline once for reuse below.
xml_pipeline = DocumentToXMLPipeline()
```
%% Cell type:code id: tags:
``` python
def process_documents(dataset):
    """Run the XML pipeline over every plain-text preface in *dataset*."""
    preface_texts = [item['plain_text'] for item in dataset]
    # Convert each preface independently and collect the XML fragments.
    return [xml_pipeline(doc) for doc in preface_texts]
```
%% Cell type:code id: tags:
``` python
# Running the pipeline with the example dataset
# full_xml_outputs = process_documents(example_dataset)
# for output in full_xml_outputs:
# print(output)
```
%% Cell type:markdown id: tags:
## Step 5: Optimizing the Pipeline
%% Cell type:code id: tags:
``` python
def load_data_from_json(file_path):
    """Read *file_path* and return its parsed JSON content."""
    with open(file_path, 'r') as handle:
        return json.load(handle)
def prepare_example(text, xml):
    """Build a DSPy Example with 'text' as the input field and 'xml' as the label."""
    # Strip surrounding whitespace from both sides of the pair.
    payload = {
        'text': text.strip(),
        'xml': xml.strip(),
    }
    return dspy.Example(payload).with_inputs("text")
def create_dataset(data):
    """Convert the loaded JSON mapping into a list of DSPy examples,
    one per entry, via ``prepare_example``."""
    examples = []
    for entry in data.values():
        examples.append(prepare_example(entry['text'], entry['xml']))
    return examples
```
%% Cell type:code id: tags:
``` python
# NOTE(review): hard-coded absolute path to one developer's machine —
# consider making this relative to the repo's data directory.
file_path = '/Users/nasredine/dev/work/playground/dspy_programs/prefaces.json'
# Load and prepare the dataset
data = load_data_from_json(file_path)
# NOTE(review): 'dataset' and 'trainset' are built identically from the
# same data — there is no held-out split here; confirm this is intended.
dataset = create_dataset(data)
trainset = create_dataset(data)
```
%% Cell type:code id: tags:
``` python
trainset
```
%% Output
[Example({'text': '.\n\nCommission Delegated Regulation (EU) 2020/...\n\nof 29 October 2019\n\namending Regulation (EC) No 1272/2008 of the European Parliament and of\nthe Council on classification, labelling and packaging of substances and\nmixtures as regards information relating to emergency health response\n\n**(Text with EEA relevance)**', 'xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'COMMISSION IMPLEMENTING REGULATION (EU) 2020/...\n\nof 14 January 2020\n\nconcerning the non-approval of *Vitis vinifera* cane tannins as a basic\nsubstance in accordance with Regulation (EC) No 1107/2009 of the\nEuropean Parliament and of the Council concerning the placing of plant\nprotection products on the market\n\n**(Text with EEA relevance)**', 'xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/29</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>concerning the non-approval of<span class="ITALIC">Vitis vinifera</span>cane tannins as a basic substance in accordance with Regulation (EC) No 1107/2009 of the European Parliament and of the Council concerning the placing of plant protection products on the market</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'Commission Delegated Directive (EU) 2019/...\n\nof 2 August 2019\n\nsupplementing Directive (EU) 2017/2397 of the European Parliament and of\nthe Council as regards the standards for competences and corresponding\nknowledge and skills, for the practical examinations, for the approval\nof simulators and for medical fitness\n\n(Text with EEA relevance)', 'xml': '<preface>\n <longTitle>\n <p>Commission Delegated Directive (EU) 2020/12</p>\n <p>of<docDate date="2019-08-02">2 August 2019</docDate></p>\n <p>supplementing Directive (EU) 2017/2397 of the European Parliament and of the Council as regards the standards for competences and corresponding knowledge and skills, for the practical examinations, for the approval of simulators and for medical fitness</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'UN Regulation No 53 --- Uniform provisions concerning the approval of\ncategory L~3~ vehicles with regard to the installation of lighting and\nlight-signalling devices \\[2019/\\...\\]**', 'xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 53</span>— Uniform provisions concerning the approval of category L<span class="SUB">3</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/31]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'COMMISSION DELEGATED REGULATION (EU) 2020/...\n\nof [31 October 2019]{.mark}\n\namending Annexes I and III to Regulation (EU) 2019/631 of the European\nParliament and of the Council as regards the monitoring of CO~2~\nemissions from new light commercial vehicles type-approved in a\nmulti-stage process\n\n**(Text with EEA relevance)**', 'xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/22</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-31">31 October 2019</docDate></p>\n <p>amending Annexes I and III to Regulation (EU) 2019/631 of the European Parliament and of the Council as regards the monitoring of CO<span class="SUB">2</span>emissions from new light commercial vehicles type-approved in a multi-stage process</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'COMMISSION IMPLEMENTING REGULATION (EU) 2020/...\n\nof 13 January 2020\n\namending and correcting Regulation (EC) No 1235/2008 laying down\ndetailed rules for implementation of Council Regulation (EC) No 834/2007\nas regards the arrangements for imports of organic products from third\ncountries\n\n**(Text with EEA relevance)**', 'xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/25</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2008 laying down detailed rules for implementation of Council Regulation (EC) No 834/2007 as regards the arrangements for imports of organic products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': 'COMMISSION IMPLEMENTING REGULATION (EU) 2020/...\n\nof 14 January 2020\n\namending Implementing Regulation (EU) No 404/2011 as regards detailed\nrules for the direct electronic exchange of information enacted under\nthe rules of the Common Fisheries Policy', 'xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/30</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>amending Implementing Regulation (EU) No 404/2011 as regards detailed rules for the direct electronic exchange of information enacted under the rules of the Common Fisheries Policy</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '**UN Regulation No 74 --- Uniform provisions concerning the approval of\ncategory L~1~ vehicles with regard to the installation of lighting and\nlight-signalling devices \\[2019/...\\]**', 'xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 74</span>— Uniform provisions concerning the approval of category L<span class="SUB">1</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/32]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'})]
%% Cell type:code id: tags:
``` python
trainset[0]['xml']
```
%% Output
'<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'
%% Cell type:code id: tags:
``` python
from rouge_score import rouge_scorer
```
%% Cell type:code id: tags:
``` python
def validate_xml_rouge_score(example, pred, trace=None, threshold=0.7):
    """DSPy metric: accept *pred* when its ROUGE-L F1 against the gold
    XML in *example* meets *threshold*.

    :param example: example carrying the reference XML under ``'xml'``.
    :param pred: generated XML string to score.
    :param trace: unused; present to satisfy DSPy's metric signature.
    :param threshold: minimum ROUGE-L F1 for acceptance (default 0.7,
        previously hard-coded — now tunable without editing the body).
    :returns: True if the prediction passes the threshold.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = scorer.score(example['xml'], pred)
    # F1 (fmeasure) balances precision and recall of token overlap.
    rouge1_f1 = scores['rouge1'].fmeasure
    rougeL_f1 = scores['rougeL'].fmeasure
    print("ROUGE-1 F1:", rouge1_f1, "| ROUGE-L F1:", rougeL_f1)
    return rougeL_f1 >= threshold
```
%% Cell type:code id: tags:
``` python
from dspy.teleprompt import BootstrapFewShot
# Optimize the pipeline: bootstrap few-shot demonstrations from the
# trainset, keeping only traces that pass the ROUGE-based metric.
teleprompter = BootstrapFewShot(metric=validate_xml_rouge_score)
compiled_pipeline = teleprompter.compile(DocumentToXMLPipeline(), trainset=trainset)
```
%% Output
50%|█████ | 4/8 [00:00<00:00, 187.45it/s]
ROUGE-1 F1: 0.9857142857142858 | ROUGE-L F1: 0.9857142857142858
ROUGE-1 F1: 0.9565217391304348 | ROUGE-L F1: 0.9565217391304348
ROUGE-1 F1: 0.9305555555555556 | ROUGE-L F1: 0.9305555555555556
ROUGE-1 F1: 0.9629629629629629 | ROUGE-L F1: 0.9629629629629629
Bootstrapped 4 full traces after 5 examples in round 0.
%% Cell type:code id: tags:
``` python
compiled_pipeline.save("prefaces.prog.json")
```
%% Output
[('transform', Predict(StringSignature(text -> rationale, xml
instructions='Create an XML representation of a document preface section in the Akoma Ntoso (AKN) format.'
text = Field(annotation=str required=True json_schema_extra={'desc': 'Raw text format of the document prefece section', '__dspy_field_type': 'input', 'prefix': 'Text:'})
rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the xml}. We ...', '__dspy_field_type': 'output'})
xml = Field(annotation=str required=True json_schema_extra={'desc': 'Akoma Ntoso (AKN) XML representation of the input preface', '__dspy_field_type': 'output', 'prefix': 'Xml:'})
)))]
%% Cell type:code id: tags:
``` python
text = "COMMISSION IMPLEMENTING REGULATION (EU) 2021/...\n\nof 13 January 2021\n\namending and correcting Regulation (EC) No 1235/2009 laying down\ndetailed rules for implementation of Council Regulation (EC) No 834/2008\nas regards the arrangements for imports of electrical products from third\ncountries\n\n**(Text with EEA relevance)**\n\n"
```
%% Cell type:code id: tags:
``` python
xml = compiled_pipeline(text)
```
%% Cell type:code id: tags:
``` python
xml
```
%% Output
'<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2021/...</docNumber></span>\n </p>\n <p>of<docDate date="2021-01-13">13 January 2021</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2009 laying down detailed rules for implementation of Council Regulation (EC) No 834/2008 as regards the arrangements for imports of electrical products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n</preface>'
%% Cell type:markdown id: tags:
'```xml\n<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2021/...</docNumber></span>\n </p>\n <p>of<docDate date="2021-01-13">13 January 2021</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2009 laying down detailed rules for implementation of Council Regulation (EC) No 834/2008 as regards the arrangements for imports of electrical products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n</preface>\n```'
%% Cell type:markdown id: tags:
<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/25</docNumber></span>\n </p>\n <p>of<docDate date=\"2020-01-13\">13 January 2020</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2008 laying down detailed rules for implementation of Council Regulation (EC) No 834/2007 as regards the arrangements for imports of organic products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>\n
%% Cell type:code id: tags:
``` python
compiled_pipeline
```
%% Output
transform = Predict(StringSignature(text -> rationale, xml
instructions='Create an XML representation of a document preface section in the Akoma Ntoso (AKN) format.'
text = Field(annotation=str required=True json_schema_extra={'desc': 'Raw text format of the document prefece section', '__dspy_field_type': 'input', 'prefix': 'Text:'})
rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the xml}. We ...', '__dspy_field_type': 'output'})
xml = Field(annotation=str required=True json_schema_extra={'desc': 'Akoma Ntoso (AKN) XML representation of the input preface', '__dspy_field_type': 'output', 'prefix': 'Xml:'})
))