Code development platform for open source projects from the European Union institutions

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • ai4xml/playground
1 result
Show changes
Commits on Source (4)
......@@ -9,3 +9,6 @@ analyze_akn_datasets/__pycache__
Conval_API/ids.csv
prompt_engineering_experiments/evaluation_results/
prompt_engineering_experiments/data/*
dspy_programs/data
.DS_Store
%% Cell type:code id: tags:
``` python
!pip install tqdm
!pip install lxml
```
%% Output
Requirement already satisfied: tqdm in /home/nasredine/dev/work/ai4xml/playground/myenv/lib/python3.11/site-packages (4.66.2)
Requirement already satisfied: lxml in /home/nasredine/dev/work/ai4xml/playground/myenv/lib/python3.11/site-packages (5.2.2)
%% Cell type:code id: tags:
``` python
import os
from functions import *
```
%% Cell type:code id: tags:
``` python
cwd = os.getcwd()
documents_dir = os.path.join(f'{cwd}/data/genai4lex', 'Documents')
results_dir = os.path.join(cwd, 'results/genai4lex')
```
%% Cell type:markdown id: tags:
### download AKN documents from genai4lex repo
%% Cell type:code id: tags:
``` python
# Create results_dir if it does not exist.
# exist_ok=True replaces the isdir check: it is race-free and a no-op when
# the directory is already there.
os.makedirs(results_dir, exist_ok=True)
```
%% Cell type:code id: tags:
``` python
# Check if the 'Documents' folder exists, if not, download and extract the zip file
if not os.path.isdir(documents_dir):
    zip_url = "https://gitlab.com/CIRSFID/genai4lex/-/raw/main/LegalResources/Eur-Lex/2010-2021/Documents.zip?inline=false"
    download_and_extract_zip(cwd, zip_url)  # Ensure this function is defined to handle download and extraction
    # If the download/extract still did not produce the expected folder,
    # abort the notebook run rather than continue with missing data.
    if not os.path.isdir(documents_dir):
        print("Invalid directory path.")
        exit()
```
%% Cell type:markdown id: tags:
### download AKN schema
%% Cell type:code id: tags:
``` python
schema_dir = os.path.join(cwd, 'schema')
os.makedirs(schema_dir, exist_ok=True)
# Download Akoma Ntoso Schema
schema_url = "https://docs.oasis-open.org/legaldocml/akn-core/v1.0/os/part2-specs/schemas/akomantoso30.xsd"
schema_path = os.path.join(schema_dir, 'akomantoso30.xsd')
if not os.path.exists(schema_path):
download_schema(schema_url, schema_path)
```
%% Cell type:markdown id: tags:
### Analysis and statistics
%% Cell type:code id: tags:
``` python
# Assuming analyze_xml_files and the associated functions are defined elsewhere
results, stats = analyze_xml_files(documents_dir,schema_path)
```
%% Output
Analyzing XML files: 100%|██████████| 15283/15283 [00:22<00:00, 674.22it/s]
%% Cell type:code id: tags:
``` python
output_csv_path = os.path.join(results_dir, 'results.csv')
# Write results to CSV
write_results_to_csv(results, output_csv_path) # Ensure this function is defined
stats_file_path = os.path.join(results_dir, 'statistics.csv')
write_stats_to_file(stats, stats_file_path) # Ensure this function is defined
```
%% Cell type:code id: tags:
``` python
print(stats)
```
%% Output
{'Average Total Pages': 4.1452594385919, 'Missing Total Pages': 0, 'Missing OJ Number': 0, 'Missing Publication Date': 0, 'Earliest Publication Date': '2010-01-05', 'Latest Publication Date': '2021-08-09'}
%% Cell type:markdown id: tags:
## Schema validation
%% Cell type:markdown id: tags:
### validate documents
%% Cell type:code id: tags:
``` python
results, valid, invalid = validate_xml_files(documents_dir, schema_path)
```
%% Output
Validating XML files: 100%|██████████| 15283/15283 [00:16<00:00, 934.79file/s]
%% Cell type:code id: tags:
``` python
output_csv_path = os.path.join(results_dir, 'validation_results.csv')
write_results_to_csv2(results, output_csv_path)
print(f"Validation results have been written to {output_csv_path}")
```
%% Output
Validation results have been written to /home/nasredine/dev/work/ai4xml/playground/analyze_akn_datasets/results/genai4lex/validation_results.csv
%% Cell type:code id: tags:
``` python
print(f'Number of valid files {valid}')
print(f'Number of invalid files {invalid}')
```
%% Output
Number of valid files 12759
Number of invalid files 2524
......
%% Cell type:code id: tags:
``` python
!pip install dspy-ai
!pip install python-dotenv
!pip install rouge-score
```
%% Cell type:code id: tags:
``` python
from dotenv import load_dotenv
import os
import json
```
%% Cell type:code id: tags:
``` python
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
```
%% Cell type:markdown id: tags:
## Step 1: Setup
%% Cell type:code id: tags:
``` python
import dspy
turbo = dspy.OpenAI(api_key=api_key, model='gpt-3.5-turbo')
dspy.settings.configure(lm=turbo)
```
%% Cell type:markdown id: tags:
## Step 2: Define Signatures
%% Cell type:code id: tags:
``` python
# class Document:
# """A simple document class to simulate the expected structure."""
# def __init__(self, text):
# self.sections = [Section(text)]
# class Section:
# """A section of the document."""
# def __init__(self, text):
# self.text = text
# DSPy signature declaring the input/output contract for the LLM call:
# plain cover-page text in, AKN XML out.
class GenerateAKN(dspy.Signature):
    """Create an XML representation of a document cover page in the Akoma Ntoso (AKN) format.

    Fields:
        text: raw plain-text form of the document cover page (input).
        xml: Akoma Ntoso (AKN) XML rendering of that cover page (output).
    """
    text = dspy.InputField(desc="Raw text format of the document cover page")
    xml = dspy.OutputField(desc="Akoma Ntoso (AKN) XML representation of the input cover page")
```
%% Cell type:markdown id: tags:
## Step 3: Building the Transformation Pipeline
%% Cell type:code id: tags:
``` python
class DocumentToXMLPipeline(dspy.Module):
    """DSPy module that converts raw cover-page text into an AKN XML string
    wrapped in a <root> element."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought predictor built from the GenerateAKN signature.
        self.transform = dspy.ChainOfThought(GenerateAKN)

    def forward(self, text):
        # Empty/falsy input short-circuits to an empty string instead of
        # calling the language model.
        if not text:
            return ""
        # Generate XML for the cover page via the LM.
        xml_cover_page = self.transform(text=text)
        # Wrap the generated fragment in a root element.
        full_xml = f"<root>{xml_cover_page.xml}</root>"
        return full_xml
```
%% Cell type:markdown id: tags:
## Step 4: Executing the Pipeline (0-shot conversion without optimization)
%% Cell type:code id: tags:
``` python
xml_pipeline = DocumentToXMLPipeline()
```
%% Cell type:code id: tags:
``` python
def process_documents(dataset):
    """Run every cover page in *dataset* through the global xml_pipeline.

    Each dataset item must expose a 'plain_text' key; the pipeline's XML
    output for each item is collected in order and returned as a list.
    """
    return [xml_pipeline(record['plain_text']) for record in dataset]
```
%% Cell type:code id: tags:
``` python
# Running the pipeline with the example dataset
full_xml_outputs = process_documents(example_dataset)
for output in full_xml_outputs:
print(output)
```
%% Output
Prediction(
rationale='produce the xml. We will first identify the key elements of the cover page such as the title, date, proposal number, and the entities involved. We will then structure this information in the Akoma Ntoso (AKN) format.',
xml='```xml\n<coverPage>\n <title>Proposal for a REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL amending Regulation (EC) No 1008/2008 on common rules for the operation of air services in the Community</title>\n <date>21.12.2016</date>\n <proposalNumber>2016/0411 (COD)</proposalNumber>\n <entities>\n <entity type="author">'
)
Prediction(
rationale='produce the xml. We need to identify the key elements of the document cover page such as the title, date, file number, sender, recipient, and subject. We will then structure this information in the Akoma Ntoso (AKN) format.',
xml='```xml\n<coverPage>\n <title>COUNCIL OF THE EUROPEAN UNION</title>\n <date>27 February 2017</date>\n <language>en</language>\n <fileNumber>2016/0030 (COD)</fileNumber>\n <sender>General Secretariat of the Council</sender>\n <recipient>Permanent Representatives Committee</recipient>\n <subject>Proposal for a REGULATION OF THE EUROPEAN'
)
%% Cell type:markdown id: tags:
## Step 5: Optimizing the Pipeline
%% Cell type:code id: tags:
``` python
def load_data_from_json(file_path):
    """Deserialize and return the JSON document stored at *file_path*."""
    with open(file_path, 'r') as handle:
        return json.load(handle)
def prepare_example(text, expected_xml):
    """Wrap one (text, expected_xml) pair as a dspy.Example, marking 'text'
    as the input field. Both strings are whitespace-stripped."""
    # Assuming 'dspy.Example' is the correct class from your DSPy framework
    example = dspy.Example({
        'text': text.strip(),  # Using strip() to clean whitespace
        'expected_xml': expected_xml.strip()
    }).with_inputs("text")
    return example

def create_dataset(data):
    # *data* is a dict keyed by document id whose values carry 'text' and
    # 'expected_xml' (the shape written by extract_preface_content).
    return [prepare_example(item['text'], item['expected_xml']) for item in data.values()]
```
%% Cell type:code id: tags:
``` python
file_path = '/Users/nasredine/dev/work/playground/dspy_programs/prefaces.json'
# Load and prepare the dataset
data = load_data_from_json(file_path)
trainset = create_dataset(data)
```
%% Cell type:code id: tags:
``` python
trainset
```
%% Output
[Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/29</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>concerning the non-approval of<span class="ITALIC">Vitis vinifera</span>cane tannins as a basic substance in accordance with Regulation (EC) No 1107/2009 of the European Parliament and of the Council concerning the placing of plant protection products on the market</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>Commission Delegated Directive (EU) 2020/12</p>\n <p>of<docDate date="2019-08-02">2 August 2019</docDate></p>\n <p>supplementing Directive (EU) 2017/2397 of the European Parliament and of the Council as regards the standards for competences and corresponding knowledge and skills, for the practical examinations, for the approval of simulators and for medical fitness</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 53</span>— Uniform provisions concerning the approval of category L<span class="SUB">3</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/31]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/24</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>authorising an extension of use of chia seeds (<span class="ITALIC">Salvia hispanica</span>) as a novel food and the change of the conditions of use and the specific labelling requirements of chia seeds (<span class="ITALIC">Salvia hispanica</span>) under Regulation (EU) 2015/2283 of the European Parliament and of the Council and amending Commission Implementing Regulation (EU) 2017/2470</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/23</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>concerning the non-renewal of the approval of the active substance thiacloprid, in accordance with Regulation (EC) No 1107/2009 of the European Parliament and of the Council concerning the placing of plant protection products on the market, and amending the Annex to Commission Implementing Regulation (EU) No 540/2011</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/22</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-31">31 October 2019</docDate></p>\n <p>amending Annexes I and III to Regulation (EU) 2019/631 of the European Parliament and of the Council as regards the monitoring of CO<span class="SUB">2</span>emissions from new light commercial vehicles type-approved in a multi-stage process</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/25</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2008 laying down detailed rules for implementation of Council Regulation (EC) No 834/2007 as regards the arrangements for imports of organic products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/30</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>amending Implementing Regulation (EU) No 404/2011 as regards detailed rules for the direct electronic exchange of information enacted under the rules of the Common Fisheries Policy</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 74</span>— Uniform provisions concerning the approval of category L<span class="SUB">1</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/32]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'})]
%% Cell type:code id: tags:
``` python
trainset[0]['expected_xml']
```
%% Output
'<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'
%% Cell type:code id: tags:
``` python
from rouge_score import rouge_scorer
```
%% Cell type:code id: tags:
``` python
def validate_xml_rouge_score(example, pred, trace=None):
    """DSPy metric: ROUGE-score *pred* against example['expected_xml'].

    NOTE(review): the threshold below is 0.0 and fmeasure is never negative,
    so this metric currently accepts every prediction — raise the threshold
    to actually filter bootstrapped traces.
    NOTE(review): *pred* is passed straight to scorer.score, which expects a
    string; confirm the pipeline returns a plain string here and not a
    Prediction object.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = scorer.score(example['expected_xml'], pred)
    # Extracting the F1 scores from the results
    rouge1_f1 = scores['rouge1'].fmeasure
    rougeL_f1 = scores['rougeL'].fmeasure
    print("ROUGE-1 F1:", rouge1_f1, "| ROUGE-L F1:", rougeL_f1)
    # Setting a threshold for ROUGE-L
    return rougeL_f1 >= 0.0  # Threshold can be adjusted as needed
```
%% Cell type:code id: tags:
``` python
from dspy.teleprompt import BootstrapFewShot
teleprompter = BootstrapFewShot(metric=validate_xml_rouge_score)
compiled_pipeline = teleprompter.compile(DocumentToXMLPipeline(), trainset=trainset)
```
%% Output
100%|██████████| 2/2 [00:00<00:00, 34.16it/s]
ROUGE-1 F1: 0.430939226519337 | ROUGE-L F1: 0.34254143646408847
ROUGE-1 F1: 0.22372881355932203 | ROUGE-L F1: 0.21694915254237288
Bootstrapped 2 full traces after 2 examples in round 0.
This diff is collapsed.
%% Cell type:markdown id: tags:
Download the data
%% Cell type:code id: tags:
``` python
import requests
import zipfile
import os
```
%% Cell type:code id: tags:
``` python
def download_and_extract_zip(url, extract_to):
    """Download a ZIP file from *url* and extract it into *extract_to*.

    The directory is created if missing; if it already contains any entries
    the download is skipped entirely.

    Args:
        url: HTTP(S) URL of the ZIP archive.
        extract_to: directory that receives the extracted files.
    """
    # Bug fix: the directory must exist before it is checked —
    # os.listdir raises FileNotFoundError on a missing path, so the
    # original order (listdir first, makedirs later) crashed on first run.
    os.makedirs(extract_to, exist_ok=True)
    if os.listdir(extract_to):
        print(f"Data already exists in {extract_to}. Skipping download.")
        return
    # Get the content from the URL
    response = requests.get(url)
    response.raise_for_status()  # Check that the request was successful
    # Path to save the downloaded ZIP file
    zip_path = os.path.join(extract_to, 'downloaded_files.zip')
    # Write the content to a ZIP file
    with open(zip_path, 'wb') as f:
        f.write(response.content)
    # Open the ZIP file and extract its contents
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    # Remove the archive once extracted; only its contents are needed.
    os.remove(zip_path)
    print(f"Files extracted to {extract_to}")
```
%% Cell type:code id: tags:
``` python
# URL of the file to be downloaded
s3_url = "https://ai4xml-data.s3.eu-west-1.amazonaws.com/planJO/selection_for_gen4ai/gen4ai_related_files.zip"
# Directory to store the extracted files
output_dir = 'data/genai4lex_word_docs/'
download_and_extract_zip(s3_url, output_dir)
```
%% Cell type:code id: tags:
``` python
from functions import *
```
%% Cell type:code id: tags:
``` python
# Specify the folder containing XML files and the output JSON file name
xml_folder = 'data/genai4lex_word_docs_xml'
output_json = 'prefaces.json'
extract_preface_content(xml_folder, output_json)
```
%% Cell type:markdown id: tags:
XML comparison for testing
Should ROUGE be replaced with a structural XML diff metric?
%% Cell type:code id: tags:
``` python
!pip install xmldiff
```
%% Output
Requirement already satisfied: xmldiff in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (2.7.0)
Requirement already satisfied: setuptools in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from xmldiff) (58.0.4)
Requirement already satisfied: lxml>=3.1.0 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from xmldiff) (5.2.2)
%% Cell type:code id: tags:
``` python
from xmldiff import main, formatting
def compare_xml_content(xml1, xml2):
    """Compare two XML documents and return the differences.

    Uses xmldiff's text differ; the return value is an XML string annotated
    with diff:insert / diff:delete markers produced by XMLFormatter.
    """
    diffs = main.diff_texts(xml1, xml2, formatter=formatting.XMLFormatter())
    return diffs
# Example XML documents
xml1 = """<root>
<child1 attribute="value1">Text1</child1>
<child2>Text2</child2>
</root>"""
xml2 = """<root>
<child1 attribute="value1">Text1</child1>
<child2>Text3</child2> <!-- Changed text -->
<child3>New child</child3> <!-- New element -->
</root>"""
# Get the difference
difference = compare_xml_content(xml1, xml2)
print("Differences:", difference)
```
%% Output
Differences: <root xmlns:diff="http://namespaces.shoobx.com/diff">
<child1 attribute="value1">Text1</child1>
<child2>Text<diff:delete>2</diff:delete><diff:insert>3</diff:insert></child2><diff:delete>
</diff:delete><diff:insert> </diff:insert><child3 diff:insert="">New child</child3><diff:insert> </diff:insert></root>
%% Cell type:code id: tags:
``` python
import xml.etree.ElementTree as ET
import logging
class XmlTree():
    """Structural comparator for xml.etree.ElementTree element trees.

    Compares tags, attributes, text, tails and children recursively;
    mismatches are logged at DEBUG level to 'xml-comparison.log'. A text
    value of '*' acts as a wildcard that matches anything.
    """

    def __init__(self):
        self.hdlr = logging.FileHandler('xml-comparison.log')
        self.formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        # Bug fix: the original never created self.logger, so every mismatch
        # branch crashed with AttributeError instead of returning False.
        self.hdlr.setFormatter(self.formatter)
        self.logger = logging.getLogger('xml-comparison')
        if not self.logger.handlers:
            self.logger.addHandler(self.hdlr)

    @staticmethod
    def convert_string_to_tree(xmlString):
        """Parse *xmlString* and return its root Element."""
        return ET.fromstring(xmlString)

    def xml_compare(self, x1, x2, excludes=None):
        """
        Compares two xml etrees
        :param x1: the first tree
        :param x2: the second tree
        :param excludes: list of string of attributes (and child tags) to
            exclude from comparison
        :return:
            True if both files match
        """
        # None sentinel avoids the mutable-default-argument pitfall.
        excludes = [] if excludes is None else excludes
        if x1.tag != x2.tag:
            self.logger.debug('Tags do not match: %s and %s', x1.tag, x2.tag)
            return False
        for name, value in x1.attrib.items():
            if name not in excludes:
                if x2.attrib.get(name) != value:
                    self.logger.debug('Attributes do not match: %s=%r, %s=%r',
                                      name, value, name, x2.attrib.get(name))
                    return False
        for name in x2.attrib.keys():
            if name not in excludes:
                if name not in x1.attrib:
                    self.logger.debug('x2 has an attribute x1 is missing: %s', name)
                    return False
        if not self.text_compare(x1.text, x2.text):
            self.logger.debug('text: %r != %r', x1.text, x2.text)
            return False
        if not self.text_compare(x1.tail, x2.tail):
            self.logger.debug('tail: %r != %r', x1.tail, x2.tail)
            return False
        # Bug fix: Element.getchildren() was removed in Python 3.9 (the
        # notebook output shows the resulting AttributeError); iterating
        # the element directly yields its children.
        cl1 = list(x1)
        cl2 = list(x2)
        if len(cl1) != len(cl2):
            self.logger.debug('children length differs, %i != %i',
                              len(cl1), len(cl2))
            return False
        i = 0
        for c1, c2 in zip(cl1, cl2):
            i += 1
            if c1.tag not in excludes:
                if not self.xml_compare(c1, c2, excludes):
                    self.logger.debug('children %i do not match: %s', i, c1.tag)
                    return False
        return True

    def text_compare(self, t1, t2):
        """
        Compare two text strings
        :param t1: text one
        :param t2: text two
        :return:
            True if a match ('*' on either side matches anything)
        """
        if not t1 and not t2:
            return True
        if t1 == '*' or t2 == '*':
            return True
        return (t1 or '').strip() == (t2 or '').strip()
```
%% Cell type:code id: tags:
``` python
xml1 = "<note><to>Tove</to><from>Jani</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"
xml2 = "<note><to>Tove</to><from>Daniel</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"
tree1 = XmlTree.convert_string_to_tree(xml1)
tree2 = XmlTree.convert_string_to_tree(xml2)
comparator = XmlTree()
if comparator.xml_compare(tree1, tree2, ["from"]):
print ("XMLs match")
else:
print ("XMLs don't match")
```
%% Output
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[6], line 10
6 tree2 = XmlTree.convert_string_to_tree(xml2)
8 comparator = XmlTree()
---> 10 if comparator.xml_compare(tree1, tree2, ["from"]):
11 print ("XMLs match")
12 else:
Cell In[4], line 46, in XmlTree.xml_compare(self, x1, x2, excludes)
44 self.logger.debug('tail: %r != %r' % (x1.tail, x2.tail))
45 return False
---> 46 cl1 = x1.getchildren()
47 cl2 = x2.getchildren()
48 if len(cl1) != len(cl2):
AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'getchildren'
%% Cell type:markdown id: tags:
# Extract text using pandoc
%% Cell type:code id: tags:
``` python
# Set the path to your documents and the output JSON file
root_folder = '/Users/nasredine/dev/work/playground/dspy_programs/data/genai4lex_word_docs_xml'
output_json = 'md_text_output.json'
process_documents(root_folder, output_json)
```
%% Cell type:code id: tags:
``` python
# Example usage
xml_data = """<your xml string here>"""
md_data = """<your markdown string here>"""
result = extract_and_find(xml_data, md_data)
print(result)
```
This diff is collapsed.
import re
import os
import json
import subprocess
import xml.etree.ElementTree as ET
import requests
import zipfile
def normalize_text(text):
    """Collapse all whitespace runs to single spaces, lowercase, and trim."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip().lower()

def extract_and_find(xml_string, md_string, length=30):
    """Locate the normalized XML text inside the normalized Markdown text.

    Uses the first and last *length* characters of the normalized XML as
    anchors; reports the index span when both anchors occur in order in
    the Markdown, otherwise reports that no match was found.
    """
    needle = normalize_text(xml_string)
    haystack = normalize_text(md_string)
    head, tail = needle[:length], needle[-length:]
    begin = haystack.find(head)
    finish = haystack.rfind(tail)
    if begin != -1 and finish != -1 and begin < finish:
        return f"Text likely spans from index {begin} to {finish + length} in the Markdown file."
    return "Matching text not found in Markdown."
def remove_namespaces(xml_element):
    """Strip '{namespace}' prefixes from every tag and attribute, in place."""
    for node in xml_element.iter():
        # ElementTree spells namespaced names as '{uri}local'; keep 'local'.
        if '}' in node.tag:
            node.tag = node.tag.split('}', 1)[1]
        for key in [k for k in node.attrib if '}' in k]:
            node.attrib[key.split('}', 1)[1]] = node.attrib.pop(key)
# NOTE(review): this is an exact duplicate of the remove_namespaces defined
# immediately above — it rebinds the same name with identical code and can
# safely be deleted.
def remove_namespaces(xml_element):
    """ Recursively remove namespace prefixes from an XML element and its children. """
    for elem in xml_element.iter():
        if '}' in elem.tag:
            elem.tag = elem.tag.split('}', 1)[1]  # Removes namespace
        # Update attributes to remove namespaces
        attributes = list(elem.attrib.keys())
        for attr in attributes:
            if '}' in attr:
                new_attr = attr.split('}', 1)[1]
                elem.attrib[new_attr] = elem.attrib.pop(attr)
def extract_preface_content(xml_folder, output_json):
    """Collect the <preface> element of every XML file under *xml_folder*.

    Walks the folder recursively; for each .xml file, parses it, strips XML
    namespaces, and serializes its first <preface> element. Results are
    keyed by filename without extension (treated as the CELEX id) and
    written as a single JSON document to *output_json*. Files lacking a
    preface are recorded with the sentinel string "No preface found".
    """
    results = {}
    # Iterate over every XML file in the folder and its subfolders
    for root_dir, sub_dirs, files in os.walk(xml_folder):
        for filename in files:
            if filename.endswith('.xml'):
                # Remove the file extension from the filename
                filename_no_ext = os.path.splitext(filename)[0]
                file_path = os.path.join(root_dir, filename)
                tree = ET.parse(file_path)
                root = tree.getroot()
                # Strip namespaces so the XPath below can use plain tag names.
                remove_namespaces(root)
                preface = root.find('.//preface')
                if preface is not None:
                    # Serialize the <preface> element including its content.
                    preface_xml = ET.tostring(preface, encoding='unicode')
                    results[filename_no_ext] = {
                        'celex_id': filename_no_ext,
                        'expected_xml': preface_xml,
                        'text': ""
                    }
                else:
                    results[filename_no_ext] = {
                        'celex_id': filename_no_ext,
                        'expected_xml': "No preface found",
                        'text': ""
                    }
    # Bug fix: the original called json.dump twice on the same handle,
    # writing two concatenated JSON documents — an invalid JSON file.
    with open(output_json, 'w') as json_file:
        json.dump(results, json_file, indent=4)
def convert_docx_to_md(docx_path):
    """Convert a DOCX file to Markdown using Pandoc.

    Returns the Markdown text, or None when the conversion fails (pandoc
    missing from PATH, or pandoc exiting with a non-zero status).
    """
    try:
        # Bug fix: without check=True subprocess.run never raises
        # CalledProcessError, so the except branch below was dead code and
        # pandoc failures silently returned an empty string.
        result = subprocess.run(
            ['pandoc', '-f', 'docx', '-t', 'markdown', docx_path],
            capture_output=True, text=True, check=True)
        return result.stdout
    except FileNotFoundError as e:
        # pandoc is not installed / not on PATH
        print("Pandoc executable not found:", e)
        return None
    except subprocess.CalledProcessError as e:
        print("An error occurred while converting DOCX to Markdown:", e)
        return None
def process_documents(root_folder, output_json):
    """Convert every .docx under *root_folder* to Markdown and dump to JSON.

    The name of each .docx file's parent directory is taken as its CELEX id
    and used as the JSON key; files whose conversion yields nothing are
    skipped.
    """
    converted = {}
    for current_dir, _subdirs, filenames in os.walk(root_folder):
        for name in (n for n in filenames if n.endswith('.docx')):
            celex_id = os.path.basename(current_dir)  # parent folder = CELEX ID
            markdown_text = convert_docx_to_md(os.path.join(current_dir, name))
            if markdown_text:
                converted[celex_id] = markdown_text
    with open(output_json, 'w') as json_file:
        json.dump(converted, json_file, indent=4)
def download_and_extract_zip(script_dir, zip_url):
    """Download Documents.zip from *zip_url* into *script_dir* and extract it.

    The temporary archive is deleted after extraction.

    Args:
        script_dir: directory that receives both the archive and its contents.
        zip_url: HTTP(S) URL of the ZIP archive.

    Raises:
        requests.HTTPError: when the server responds with an error status.
    """
    print("The 'Documents' folder does not exist. Downloading and extracting the zip file...")
    # Download the zip file
    zip_file = os.path.join(script_dir, "Documents.zip")
    response = requests.get(zip_url)
    # Fix: fail loudly on HTTP errors instead of saving an error page as a
    # .zip (which would crash later as a BadZipFile). This mirrors the
    # raise_for_status() used by the notebook's download helper.
    response.raise_for_status()
    with open(zip_file, "wb") as f:
        f.write(response.content)
    # Extract the zip file
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(script_dir)
    # Remove the downloaded zip file
    os.remove(zip_file)
\ No newline at end of file
This diff is collapsed.