Commit 566a5ed4 authored by Nasredine CHENIKI

document types distribution by author

parent 2a70d693
Branch: dev
@@ -8,7 +8,7 @@ from lxml import etree
from datetime import datetime
def analyze_xml_files(directory,schema_path):
def analyze_xml_files(directory,schema_path, only_planjo=False, planjo_docx_dir=None):
results = []
total_pages_sum = 0
total_pages_count = 0
@@ -25,6 +25,8 @@ def analyze_xml_files(directory,schema_path):
schema = etree.XMLSchema(file=schema_path)
files = [f for f in os.listdir(directory) if f.endswith('.xml')]
if only_planjo:
files = [f for f in files if f.replace('.xml', '.docx') in os.listdir(planjo_docx_dir)]
# Walk through all files in the specified directory with a progress bar
for filename in tqdm(files, desc="Analyzing XML files"):
@@ -66,6 +68,10 @@ def analyze_xml_files(directory,schema_path):
language = root.find('.//akn:FRBRlanguage', namespace).get('language') if root.find('.//akn:FRBRlanguage', namespace) is not None else 'Not found'
series_type = root.find('.//fmx:COLL', namespace).text if root.find('.//fmx:COLL', namespace) is not None else 'Not found'
act_type = root.find('.//akn:act', namespace).get('name') if root.find('.//akn:act', namespace) is not None else 'Not found'
author = root.find('.//akn:FRBRauthor', namespace).get('href') if root.find('.//akn:FRBRauthor', namespace) is not None else 'Not found'
if author.startswith('#'):
author = author[1:]
# Extract EuroVOC keywords
eurovoc_keywords = []
@@ -85,7 +91,8 @@ def analyze_xml_files(directory,schema_path):
'Series Type': series_type,
'Act Type': act_type,
'Schema Validation': valid,
'EuroVOC Keywords': eurovoc_keywords
'EuroVOC Keywords': eurovoc_keywords,
'Author': author
})
except ET.ParseError as e:
@@ -350,3 +357,106 @@ def plot_validation_results(number_of_valid_files, number_of_invalid_files):
plt.title(f'XML Schema Validation Results\nTotal: {total_files:,} documents')
plt.show()
def generate_author_type_csv(results, output_file):
# Create dictionary to store type-author counts
type_author_counts = {}
# Process results to count documents by type and author
for result in results:
author = result['Author']
act_type = result['Act Type']
if author != 'Not found':
author = author.split('/')[-1] # Extract last part of href URL
if act_type not in type_author_counts:
type_author_counts[act_type] = {}
if author not in type_author_counts[act_type]:
type_author_counts[act_type][author] = 0
type_author_counts[act_type][author] += 1
# Get unique list of all authors and their English labels
all_authors = []
unique_authors = sorted(set(author for counts in type_author_counts.values()
for author in counts.keys()))
for author in unique_authors:
try:
author_url = f'http://publications.europa.eu/resource/authority/corporate-body/{author}'
response = requests.get(author_url)
if response.status_code == 200:
# Parse RDF response
root = ET.fromstring(response.content)
# Find English label using RDF namespaces
ns = {
'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'skos': 'http://www.w3.org/2004/02/skos/core#',
'xml': 'http://www.w3.org/XML/1998/namespace'
}
# Find Description element for this author
desc = root.find(f".//rdf:Description[@rdf:about='{author_url}']", ns)
if desc is not None:
# Find English prefLabel within Description
label_elem = desc.find("skos:prefLabel[@xml:lang='en']", ns)
if label_elem is not None:
all_authors.append(label_elem.text)
continue
all_authors.append(author) # Fallback if no label found
else:
all_authors.append(author) # Fallback if request fails
except Exception:
all_authors.append(author) # Fallback if any error occurs
# Write to CSV
with open(output_file, 'w', newline='') as f:
writer = csv.writer(f)
# Write header row with authors
header = ['Document Type'] + all_authors
writer.writerow(header)
# Write data rows
for doc_type in sorted(type_author_counts.keys()):
row = [doc_type]
for author in all_authors:
count = type_author_counts[doc_type].get(author, 0)
row.append(count)
writer.writerow(row)
return type_author_counts, all_authors
def plot_author_type_distribution(type_author_counts, all_authors):
# Convert to DataFrame for easier plotting
data = []
for doc_type in type_author_counts:
for author in all_authors:
count = type_author_counts[doc_type].get(author, 0)
if count > 0: # Only include non-zero counts
data.append({
'Document Type': doc_type,
'Author': author,
'Count': count
})
df = pd.DataFrame(data)
# Create a stacked bar plot
plt.figure(figsize=(15, 8))
pivot_table = df.pivot(index='Document Type', columns='Author', values='Count').fillna(0)
# Plot only top 10 document types by total documents
top_10_types = pivot_table.sum(axis=1).sort_values(ascending=False).head(10).index
pivot_table_top10 = pivot_table.loc[top_10_types]
ax = pivot_table_top10.plot(kind='bar', stacked=True)
plt.title('Author Distribution by Top 10 Document Types')
plt.xlabel('Document Type')
plt.ylabel('Number of Documents')
plt.legend(title='Author', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
%% Cell type:markdown id: tags:
# Dataset Creation
%% Cell type:markdown id: tags:
This notebook prepares the preface dataset by converting DOCX files to HTML.
We use HTML as the intermediate format instead of Markdown because:
- LLMs struggled with Markdown formatting
- Handling italic text in Markdown was particularly problematic
- HTML gives better control over structure and formatting
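%% Cell type:markdown id: tags:
The DOCX-to-HTML conversion itself is not performed in this notebook. As an illustration only, the sketch below shows one possible way to produce the HTML inputs, assuming Pandoc via the pypandoc wrapper; the directory names are hypothetical.
%% Cell type:code id: tags:
``` python
# Illustrative sketch (assumption): convert DOCX files to HTML with Pandoc via pypandoc.
# Directory names are hypothetical; the actual conversion pipeline is external to this notebook.
import os
import pypandoc

docx_dir = "data/genai4lex_word_docs"   # hypothetical input directory
html_dir = "data/genai4lex_html_docs"   # hypothetical output directory
os.makedirs(html_dir, exist_ok=True)

for name in os.listdir(docx_dir):
    if name.endswith(".docx"):
        out_path = os.path.join(html_dir, name.replace(".docx", ".html"))
        pypandoc.convert_file(os.path.join(docx_dir, name), "html", outputfile=out_path)
```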
%% Cell type:code id: tags:
``` python
%load_ext autoreload
%autoreload 2
```
%% Cell type:code id: tags:
``` python
import os
import sys
import pandas as pd
current_dir = os.getcwd()
parent_dir = f"{os.path.dirname(current_dir)}/../"
sys.path.append(parent_dir)
from functions import *
from xml_util import *
```
%% Cell type:code id: tags:
``` python
DATA_DIR = f"{parent_dir}/data/"
```
%% Cell type:code id: tags:
``` python
# load df from data/genai4lex_dataset_with_corresponding_planjo_docx_cleaned.csv
# produced from dataset_preparation_and_statistics notebook
df_genai4lex_with_planjo_docx_cleaned = pd.read_csv(f"{DATA_DIR}/genai4lex_dataset_with_corresponding_planjo_docx_cleaned.csv")
```
%% Output
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 3
1 # load df from data/genai4lex_dataset_with_corresponding_planjo_docx_cleaned.csv
2 # produced from dataset_preparation_and_statistics notebook
----> 3 DATA_DIR = f"{parent_dir}/data/"
4 df_genai4lex_with_planjo_docx_cleaned = pd.read_csv(f"{DATA_DIR}/genai4lex_dataset_with_corresponding_planjo_docx_cleaned.csv")
NameError: name 'parent_dir' is not defined
%% Cell type:code id: tags:
``` python
print("Unique act types:")
print(df_genai4lex_with_planjo_docx_cleaned['Act Type'].unique())
```
%% Output
Unique act types:
['regulation/REG_IMPL' 'regulation/REG' 'directive/DIR'
'regulation/REG_DEL' 'directive/DIR_DEL' 'regulation/CORRIGENDUM'
'complementaryLegislation/REG' 'complementaryLegislation/DATPRO'
'directive/DIR_IMPL' 'directive/CORRIGENDUM' 'regulation/REGDEL'
'complementaryLegislation/DEC' 'decision' 'complementaryLegislation/ACT'
'directive/REG']
%% Cell type:markdown id: tags:
### Filter document types
%% Cell type:code id: tags:
``` python
# Filter for specific regulation types
regulation_types = [
'regulation/REG_IMPL', # Implementing regulations
'regulation/REG' # Regular regulations
]
df_regulations = df_genai4lex_with_planjo_docx_cleaned[df_genai4lex_with_planjo_docx_cleaned['Act Type'].isin(regulation_types)]
print(f"Original dataset size: {len(df_genai4lex_with_planjo_docx_cleaned)}")
print(f"Filtered dataset size: {len(df_regulations)}")
```
%% Output
Original dataset size: 3681
Filtered dataset size: 2902
%% Cell type:code id: tags:
``` python
word_docs_dir = f"{DATA_DIR}/genai4lex_word_docs/"
xml_docs_dir = f"{DATA_DIR}/genai4lex_dataset/"
html_docs_dir = f"{DATA_DIR}/genai4lex_html_docs/"
```
%% Cell type:markdown id: tags:
# Analysis
%% Cell type:markdown id: tags:
## Keep documents smaller than 20 KB with no "amending" or "correcting" in the long title
%% Cell type:code id: tags:
``` python
# Define size threshold in KB
MAX_SIZE_KB = 20
# keep only xml files that are in the df, have size < MAX_SIZE_KB, and don't have "amending" or "correcting" in their long title
xml_files = []
xml_sizes = {}
for celex_id in df_regulations['celex_id']:
xml_path = os.path.join(xml_docs_dir, f"{celex_id}.xml")
if os.path.exists(xml_path):
file_size = os.path.getsize(xml_path)
if file_size < MAX_SIZE_KB * 1024:
with open(xml_path, 'r', encoding='utf-8') as f:
content = f.read()
root = ET.fromstring(content)
long_title = root.find('.//{*}longTitle')
if long_title is not None:
title_text = ''.join(long_title.itertext())
if 'amending' not in title_text.lower() and 'correcting' not in title_text.lower():
xml_sizes[xml_path] = file_size
# Sort files by size and extract just the paths
xml_files = [path for path, size in sorted(xml_sizes.items(), key=lambda x: x[1], reverse=True)]
```
%% Cell type:code id: tags:
``` python
print(f"Number of XML files: {len(xml_files)}")
print("Example celex ids with sizes:")
for i in range(min(5, len(xml_files))):
file_path = xml_files[i]
file_id = os.path.splitext(os.path.basename(file_path))[0]
size = xml_sizes[file_path] / 1024 # Convert to KB
print(f"- {file_id}: {size:.2f} KB")
```
%% Output
Number of XML files: 1245
Example celex ids with sizes:
- 32017R0185: 19.82 KB
- 32018R2018: 19.75 KB
- 32018R1882: 19.68 KB
- 32018R0581: 19.63 KB
- 32018R0922: 19.53 KB
%% Cell type:code id: tags:
``` python
unique_tags = set()
for xml_file in tqdm(xml_files, desc="Processing XML files"):
with open(xml_file, "r", encoding="utf-8") as file:
xml_content = file.read()
doc_tags = extract_unique_tags_from_xml(xml_content, root_tag="body")
unique_tags.update(doc_tags)
print(f"Total unique tags: {len(unique_tags)}")
for tag in sorted(unique_tags):
print(f"- {tag}")
```
%% Output
Processing XML files: 100%|██████████| 1245/1245 [00:00<00:00, 3424.96it/s]
Total unique tags: 24
- alinea
- article
- authorialNote
- body
- content
- date
- def
- defBody
- heading
- intro
- list
- mod
- num
- p
- paragraph
- point
- quantity
- quotedStructure
- ref
- span
- table
- td
- term
- tr
%% Cell type:code id: tags:
``` python
# Count, for each tag, the number of files that contain it
tag_counts = {tag: 0 for tag in unique_tags}
total_files = len(xml_files)
for xml_file in tqdm(xml_files, desc="Processing XML files"):
with open(xml_file, "r", encoding="utf-8") as file:
xml_content = file.read()
doc_tags = extract_unique_tags_from_xml(xml_content, root_tag="body")
for tag in doc_tags:
tag_counts[tag] += 1
# Calculate and display percentages
print("\nTag usage statistics:")
for tag, count in sorted(tag_counts.items(), key=lambda x: x[1], reverse=True):
percentage = (count / total_files) * 100
print(f"- {tag}: {count} files ({percentage:.1f}%)")
```
%% Output
Processing XML files: 100%|██████████| 1245/1245 [00:00<00:00, 6918.06it/s]
Tag usage statistics:
- article: 1245 files (100.0%)
- body: 1245 files (100.0%)
- alinea: 1245 files (100.0%)
- content: 1245 files (100.0%)
- p: 1245 files (100.0%)
- num: 1245 files (100.0%)
- span: 1234 files (99.1%)
- date: 437 files (35.1%)
- authorialNote: 193 files (15.5%)
- ref: 190 files (15.3%)
- heading: 186 files (14.9%)
- paragraph: 156 files (12.5%)
- point: 70 files (5.6%)
- intro: 70 files (5.6%)
- list: 70 files (5.6%)
- mod: 37 files (3.0%)
- quantity: 21 files (1.7%)
- def: 10 files (0.8%)
- tr: 8 files (0.6%)
- td: 8 files (0.6%)
- table: 8 files (0.6%)
- defBody: 7 files (0.6%)
- quotedStructure: 3 files (0.2%)
- term: 3 files (0.2%)
%% Cell type:code id: tags:
``` python
target_tags = ['term']
print("Files containing rare tags in preamble section:")
for tag in target_tags:
documents_with_body_having_tag, tag_contents = get_files_with_tag(xml_files, tag, target_part="body")
print(f"\n{tag} appears in {len(documents_with_body_having_tag)} files:")
print(documents_with_body_having_tag)
```
%% Output
Files containing rare tags in preamble section:
Processing XML files: 100%|██████████| 1245/1245 [00:00<00:00, 7438.20it/s]
term appears in 3 files:
['32018R1882', '32019R1685', '32018R0329']
%% Cell type:code id: tags:
``` python
# result, analysis_results = analyse_body_dataset_from_xml(xml_files)
# analysis_results['tag_counts']
```
%% Cell type:markdown id: tags:
## Article extraction from XML and HTML files
%% Cell type:markdown id: tags:
### Extract unique conclusion formulas from XML files
These formulas are later used to clean the last article by removing its conclusion part (a sketch of this step follows below).
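%% Cell type:markdown id: tags:
The cleaning step itself is done elsewhere; as a rough sketch only, a collected formula can be stripped from the last article roughly as follows (`last_article_text` is a hypothetical input string, `conclusion_formulas` is the set built in the next cell).
%% Cell type:code id: tags:
``` python
# Illustrative sketch only: remove a known conclusion formula (and anything after it)
# from the text of the last article.
def strip_conclusion(last_article_text, conclusion_formulas):
    for formula in conclusion_formulas:
        idx = last_article_text.find(formula)
        if idx != -1:
            return last_article_text[:idx].rstrip()
    return last_article_text
```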
%% Cell type:code id: tags:
``` python
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
conclusion_formulas = set()
for xml_file in tqdm(xml_files, desc="Processing XML files for formulas"):
celex_id = os.path.splitext(os.path.basename(xml_file))[0]
try:
with open(xml_file, "r", encoding="utf-8") as f:
xml_content = f.read()
# Parse XML content
root = ET.fromstring(xml_content)
# Find all conclusion formulas using ElementTree
# Using {*} to match any namespace
formulas = root.findall(".//{*}formula[@name='conclusionsFormula']/{*}p")
for formula in formulas:
formula_text = formula.text.strip() if formula.text else ""
if formula_text:
conclusion_formulas.add(formula_text)
except ET.ParseError:
print(f"Skipping malformed XML for celex_id: {celex_id}")
continue
except Exception as e:
print(f"Error processing {celex_id}: {e}")
continue
print("Unique conclusion formulas found:")
for formula in sorted(conclusion_formulas):
print(f"\n- {formula}")
```
%% Output
Processing XML files for formulas: 100%|██████████| 1245/1245 [00:00<00:00, 3134.71it/s]
Unique conclusion formulas found:
- It shall apply from
- This Regulation shall be binding in its entirely and directly applicable in all Member States.
- This Regulation shall be binding in its entirety and directly applicable in all Member States.
- This Regulation shall be binding in its entirety and directly applicable in the Member States in accordance with the Treaties.
%% Cell type:code id: tags:
``` python
print("Documents with formula containing 'It shall apply from':")
for xml_file in tqdm(xml_files, desc="Checking XML files for target formula"):
celex_id = os.path.splitext(os.path.basename(xml_file))[0]
try:
with open(xml_file, "r", encoding="utf-8") as f:
xml_content = f.read()
root = ET.fromstring(xml_content)
# Find all <p> elements under formulas with name 'conclusionsFormula'
formulas = root.findall(".//{*}formula[@name='conclusionsFormula']/{*}p")
for formula in formulas:
formula_text = formula.text.strip() if formula.text else ""
if "It shall apply from" in formula_text:
print(celex_id)
break # stop checking further formulas for this document
except ET.ParseError:
print(f"Skipping malformed XML for celex_id: {celex_id}")
except Exception as e:
print(f"Error processing {celex_id}: {e}")
```
%% Output
Documents with formula containing 'It shall apply from':
Checking XML files for target formula: 21%|██ | 260/1245 [00:00<00:00, 2599.87it/s]
32020R1801
Checking XML files for target formula: 100%|██████████| 1245/1245 [00:00<00:00, 3638.80it/s]
%% Cell type:markdown id: tags:
Removing an edge-case document that has a wrong conclusion formula
%% Cell type:code id: tags:
``` python
# Remove document 32020R1801 from xml_files list
xml_files = [f for f in xml_files if "32020R1801" not in f]
```
%% Cell type:markdown id: tags:
### Extract articles from xml and html files
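%% Cell type:markdown id: tags:
The helpers `extract_document_part` and `extract_body_from_html` used in the next cell are imported from the project's `functions`/`xml_util` modules and are not defined here. As an assumption about what the HTML-side helper does, a minimal sketch is shown below: it splits a document's HTML into per-article chunks at "Article N" markers (the trailing conclusion formula and signature would still need trimming).
%% Cell type:code id: tags:
``` python
# Illustrative sketch (assumption): split a document's HTML body into per-article chunks
# by locating "Article N" headings; not the project's actual extract_body_from_html.
import re

def extract_body_from_html_sketch(doc_html):
    # Find the start of each article heading such as "<p>Article 1</p>" or "<h1 ...>Article 1</h1>"
    starts = [m.start() for m in re.finditer(r"<(?:p|h\d[^>]*)>(?:<em>)?Article \d+", doc_html)]
    if not starts:
        return None
    starts.append(len(doc_html))
    # Slice the HTML between consecutive article starts
    return [doc_html[starts[i]:starts[i + 1]].strip() for i in range(len(starts) - 1)]
```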
%% Cell type:code id: tags:
``` python
import json
import re
from tqdm import tqdm
body = {}
body_missing = {}
count_found = 0
count_missing = 0
for xml_file in tqdm(xml_files, desc="Processing documents"):
celex_id = os.path.splitext(os.path.basename(xml_file))[0]
doc_akn_xml_path = xml_file
if not os.path.exists(doc_akn_xml_path):
body_missing[celex_id] = {'reason': 'XML file not found'}
continue
body_akn_xml = extract_document_part(doc_akn_xml_path, 'body')
if body_akn_xml is None:
body_missing[celex_id] = {'reason': 'Body not found in XML'}
continue
doc_html_path = os.path.join(html_docs_dir, f"{celex_id}.html")
if not os.path.exists(doc_html_path):
body_missing[celex_id] = {'reason': 'HTML file not found'}
continue
with open(doc_html_path, 'r') as file:
doc_html = file.read()
doc_html = doc_html.replace('\xa0', ' ') # replace non-breaking space with a space
html_articles = extract_body_from_html(doc_html)
if html_articles is None:
body_missing[celex_id] = {'reason': 'HTML articles not found'}
count_missing += 1
continue
# Extract articles from XML and store in dictionary
xml_articles = {}
root = ET.fromstring(body_akn_xml)
# Find all article elements
for article in root.findall('.//article'):
# Get article number from num element
num_elem = article.find('.//num')
if num_elem is not None and 'Article' in num_elem.text:
article_num = num_elem.text.replace('Article ', '')
# Convert article element to string to preserve full XML content
article_content = ET.tostring(article, encoding='unicode')
xml_articles[article_num] = article_content
# Process each HTML article and match with XML
articles_matched = {}
# First verify that number of articles matches between HTML and XML
if html_articles and len(html_articles) != len(xml_articles):
body_missing[celex_id] = {
'reason': f'Mismatched article count - HTML: {len(html_articles)}, XML: {len(xml_articles)}'
}
count_missing += 1
continue
for html_article in html_articles:
# Extract article number from HTML
article_num_match = re.search(r'Article (\d+)', html_article)
if article_num_match:
article_num = article_num_match.group(1)
# Verify this article exists in XML
if article_num not in xml_articles:
body_missing[celex_id] = {
'reason': f'Article {article_num} found in HTML but missing from XML'
}
count_missing += 1
continue
# Remove tags from HTML content
tags_to_remove = ['mark']
cleaned_html = html_article
for tag in tags_to_remove:
cleaned_html = cleaned_html.replace(f'<{tag}>', '').replace(f'</{tag}>', '')
# Store both HTML and XML versions
articles_matched[article_num] = {
'html': cleaned_html,
'xml': xml_articles[article_num]
}
# Verify all articles were matched
if len(articles_matched) == len(xml_articles):
body[celex_id] = {
'articles': articles_matched,
'full_xml': body_akn_xml
}
count_found += 1
else:
body_missing[celex_id] = {
'reason': f'Not all articles matched - Matched: {len(articles_matched)}, Expected: {len(xml_articles)}',
'html_content_sample': doc_html[:100]
}
count_missing += 1
print(f"Found articles: {count_found}")
print(f"Not found articles: {count_missing}")
```
%% Output
Processing documents: 100%|██████████| 1244/1244 [00:00<00:00, 1534.07it/s]
Found articles: 1227
Not found articles: 15
%% Cell type:markdown id: tags:
## Preface dataset filtering and sorting based on content similarity between the XML and HTML files
%% Cell type:code id: tags:
``` python
!pip install -q lxml tqdm beautifulsoup4 scikit-learn
```
%% Output
[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: pip install --upgrade pip
%% Cell type:code id: tags:
``` python
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import re
```
%% Cell type:code id: tags:
``` python
def add_space_before_tags(xml_content):
"""Add a space before tags that are directly adjacent to text."""
# Use regex to find tags that are directly adjacent to text and add a space before them
return re.sub(r'(?<=\w)(<[^>]+>)', r' \1', xml_content)
def extract_text_from_soup(soup):
"""Extract clean text from a BeautifulSoup object."""
return soup.get_text()
def normalize_text(text):
"""Normalize text by removing extra whitespace and converting text to lowercase."""
text = re.sub(r'\s+', ' ', text).strip()
return text.lower()
def combined_similarity(text1, text2, weights=None):
"""Calculate a combined similarity score using multiple methods."""
if weights is None:
# Default equal weights for each similarity measure
weights = {
'levenshtein': 0.33,
'cosine': 0.33,
'jaccard': 0.34
}
# Calculate individual similarity scores
levenshtein_score = calculate_levenshtein_score(text1, text2)
cosine_score = calculate_cosine_similarity(text1, text2)
jaccard_score = jaccard_similarity(text1, text2)
# Combine scores using the specified weights
combined_score = (
weights['levenshtein'] * levenshtein_score +
weights['cosine'] * cosine_score +
weights['jaccard'] * jaccard_score
)
return combined_score
def process_document(celex_id, document):
processed_articles = []
# Get the articles dictionary from the document; each article already has 'html' and 'xml' keys.
articles = document.get('articles', {})
# Iterate over articles in sorted order (assuming keys are numeric strings)
for key in sorted(articles, key=lambda k: int(k)):
article = articles[key]
html_article = article.get('html', '')
xml_article_str = article.get('xml', '')
# Extract and normalize text from the HTML content
html_text = normalize_text(extract_text_from_soup(BeautifulSoup(html_article, 'html.parser')))
# Extract and normalize text from the XML content
xml_text = normalize_text(extract_text_from_soup(BeautifulSoup(xml_article_str, 'xml')))
# Compute similarity between the normalized texts
similarity = combined_similarity(html_text, xml_text)
# Append tuple with celex_id, HTML content, XML content, and similarity score
processed_articles.append((celex_id, html_article, xml_article_str, similarity))
return processed_articles
def create_articles_dataframe(documents_bodies):
"""Create a DataFrame from processed documents, where each row represents a single article.
Expects documents_bodies to be a dictionary mapping celex_id to a dictionary with keys:
- 'full_xml': complete XML content as a string.
- 'articles': list of HTML strings, each representing an article.
Returns a DataFrame with columns: celex_id, html, xml, similarity.
"""
data = []
for celex_id, document in tqdm(documents_bodies.items(), desc="Processing documents"):
articles_data = process_document(celex_id, document)
data.extend(articles_data)
df = pd.DataFrame(data, columns=['celex_id', 'html', 'xml', 'similarity'])
# Extract article IDs from XML
def extract_article_id(xml_str):
try:
soup = BeautifulSoup(xml_str, 'xml')
article = soup.find('article')
if article and 'eId' in article.attrs:
return article['eId']
return None
except Exception:
return None
# Create unique ID combining celex_id and article ID
df['article_id'] = df['xml'].apply(extract_article_id)
df['id'] = df['celex_id'] + '_' + df['article_id']
df.set_index('id', inplace=True)
return df
```
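%% Cell type:markdown id: tags:
`calculate_levenshtein_score`, `calculate_cosine_similarity` and `jaccard_similarity` are imported from the project's `functions` module and are not defined in this notebook. The sketch below is an assumption about what they might compute: a normalized edit-similarity ratio, TF-IDF cosine similarity, and token-set Jaccard similarity.
%% Cell type:code id: tags:
``` python
# Illustrative sketch (assumption): plausible stand-ins for the three similarity helpers
# used by combined_similarity above; not the project's actual implementations.
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_levenshtein_score(text1, text2):
    # Normalized similarity ratio in [0, 1] (difflib ratio as a stand-in for Levenshtein)
    return SequenceMatcher(None, text1, text2).ratio()

def calculate_cosine_similarity(text1, text2):
    # Cosine similarity between TF-IDF vectors of the two texts
    tfidf = TfidfVectorizer().fit_transform([text1, text2])
    return float(cosine_similarity(tfidf[0], tfidf[1])[0][0])

def jaccard_similarity(text1, text2):
    # Jaccard similarity of the two token sets
    tokens1, tokens2 = set(text1.split()), set(text2.split())
    if not tokens1 and not tokens2:
        return 1.0
    return len(tokens1 & tokens2) / len(tokens1 | tokens2)
```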
%% Cell type:code id: tags:
``` python
# Create DataFrame
df_articles = create_articles_dataframe(body)
```
%% Output
Processing documents: 100%|██████████| 1227/1227 [00:08<00:00, 152.38it/s]
%% Cell type:code id: tags:
``` python
df_articles
```
%% Output
celex_id \
id
32017R0185_art_1 32017R0185
32017R0185_art_2 32017R0185
32017R0185_art_3 32017R0185
32017R0185_art_4 32017R0185
32017R0185_art_5 32017R0185
... ...
32016R1334_art_2 32016R1334
32016R1687_art_1 32016R1687
32016R1687_art_2 32016R1687
32017R0490_art_1 32017R0490
32017R0490_art_2 32017R0490
html \
id
32017R0185_art_1 <p><em>Article 1</em></p>\n<p><em>Subject matt...
32017R0185_art_2 <p><em>Article 2</em></p>\n<p><em>Derogation c...
32017R0185_art_3 <p><em>Article 3</em></p>\n<p><em>Derogation c...
32017R0185_art_4 <p><em>Article 4</em></p>\n<p><em>Derogation c...
32017R0185_art_5 <p><em>Article 5</em></p>\n<p><em>Entry into f...
... ...
32016R1334_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
32016R1687_art_1 <p>Article 1</p>\n<p>Annex III to Regulation (...
32016R1687_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
32017R0490_art_1 <p>Article 1</p>\n<p>Annex I to Regulation (EU...
32017R0490_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
xml \
id
32017R0185_art_1 <article eId="art_1" GUID="001">\n <num...
32017R0185_art_2 <article eId="art_2" GUID="002">\n <num...
32017R0185_art_3 <article eId="art_3" GUID="003">\n <num...
32017R0185_art_4 <article eId="art_4" GUID="004">\n <num...
32017R0185_art_5 <article eId="art_5" GUID="005">\n <num...
... ...
32016R1334_art_2 <article eId="art_2" GUID="002">\n <num...
32016R1687_art_1 <article eId="art_1" GUID="001">\n <num...
32016R1687_art_2 <article eId="art_2" GUID="002">\n <num...
32017R0490_art_1 <article eId="art_1" GUID="001">\n <num...
32017R0490_art_2 <article eId="art_2" GUID="002">\n <num...
similarity article_id
id
32017R0185_art_1 0.895940 art_1
32017R0185_art_2 0.905612 art_2
32017R0185_art_3 1.000000 art_3
32017R0185_art_4 1.000000 art_4
32017R0185_art_5 0.865238 art_5
... ... ...
32016R1334_art_2 0.942471 art_2
32016R1687_art_1 1.000000 art_1
32016R1687_art_2 0.476332 art_2
32017R0490_art_1 1.000000 art_1
32017R0490_art_2 0.942471 art_2
[2938 rows x 5 columns]
%% Cell type:code id: tags:
``` python
print(f"Average similarity: {df_articles['similarity'].mean()}")
```
%% Output
Average similarity: 0.9239921332656981
%% Cell type:markdown id: tags:
### Sort prefaces by similarity in descending order
%% Cell type:code id: tags:
``` python
# Sort DataFrame by similarity in descending order
df_articles_sorted = df_articles.sort_values(by='similarity', ascending=False)
# Print sorted results
print("Top 2 documents with highest similarity:")
print(df_articles_sorted.head(2)[['similarity']])
print("\nBottom 2 documents with lowest similarity:")
print(df_articles_sorted.tail(4)[['similarity']])
print(f"\nAverage similarity: {df_articles['similarity'].mean():.4f}")
```
%% Output
Top 2 documents with highest similarity:
similarity
id
32020R1641_art_3 1.0
32017R1110_art_5 1.0
Bottom 2 documents with lowest similarity:
similarity
id
32018R1935_art_2 0.426355
32019R0985_art_2 0.419034
32020R0182_art_6 0.350657
32016R1737_art_2 0.167723
Average similarity: 0.9240
%% Cell type:markdown id: tags:
## Saving data
%% Cell type:markdown id: tags:
### Save sorted dataset
%% Cell type:code id: tags:
``` python
df_articles_sorted
```
%% Output
celex_id \
id
32020R1641_art_3 32020R1641
32017R1110_art_5 32017R1110
32016R2075_art_1 32016R2075
32016R2148_art_3 32016R2148
32020R0501_art_4 32020R0501
... ...
32016R0983_art_1 32016R0983
32018R1935_art_2 32018R1935
32019R0985_art_2 32019R0985
32020R0182_art_6 32020R0182
32016R1737_art_2 32016R1737
html \
id
32020R1641_art_3 <p>Article 3</p>\n<p><strong>Equivalence</stro...
32017R1110_art_5 <p>Article 5</p>\n<p>Notification of changes t...
32016R2075_art_1 <p>Article 1</p>\n<p>The maximum number of day...
32016R2148_art_3 <p>Article 3</p>\n<p>Importers who have alread...
32020R0501_art_4 <p>Article 4</p>\n<p>By way of derogation from...
... ...
32016R0983_art_1 <p>Article 1</p>\n<p>Council Regulation (EC) N...
32018R1935_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
32019R0985_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
32020R0182_art_6 <p>Article 6</p>\n<p>This Regulation shall ent...
32016R1737_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
xml \
id
32020R1641_art_3 <article eId="art_3" GUID="003">\n <num...
32017R1110_art_5 <article eId="art_5" GUID="005">\n <num...
32016R2075_art_1 <article eId="art_1" GUID="001">\n <num...
32016R2148_art_3 <article eId="art_3" GUID="003">\n <num...
32020R0501_art_4 <article eId="art_4" GUID="004">\n <num...
... ...
32016R0983_art_1 <article eId="art_1" GUID="001">\n <num...
32018R1935_art_2 <article eId="art_2" GUID="002">\n <num...
32019R0985_art_2 <article eId="art_2" GUID="002">\n <num...
32020R0182_art_6 <article eId="art_6" GUID="006">\n <num...
32016R1737_art_2 <article eId="art_2" GUID="002">\n <num...
similarity article_id
id
32020R1641_art_3 1.000000 art_3
32017R1110_art_5 1.000000 art_5
32016R2075_art_1 1.000000 art_1
32016R2148_art_3 1.000000 art_3
32020R0501_art_4 1.000000 art_4
... ... ...
32016R0983_art_1 0.439238 art_1
32018R1935_art_2 0.426355 art_2
32019R0985_art_2 0.419034 art_2
32020R0182_art_6 0.350657 art_6
32016R1737_art_2 0.167723 art_2
[2938 rows x 5 columns]
%% Cell type:code id: tags:
``` python
import os
import json
# Create directory if it does not exist
output_dir = f'{DATA_DIR}/articles/datasets/html'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Keep only the relevant columns and reset the index
df_articles_sorted = df_articles_sorted.reset_index()[['celex_id', 'xml', 'html', 'similarity']]
# Save dataset
dataset_file = os.path.join(output_dir, 'dataset.json')
df_articles_sorted.to_json(dataset_file, orient='records', indent=2)
# Create metadata file
metadata = {
'date': datetime.now().strftime('%Y-%m-%d'),
'dataset_type': 'articles',
'dataset_name': 'articles_html',
'num_articles': len(df_articles_sorted),
'columns': list(df_articles_sorted.columns),
'avg_similarity': float(df_articles_sorted['similarity'].mean()),
'min_similarity': float(df_articles_sorted['similarity'].min()),
'max_similarity': float(df_articles_sorted['similarity'].max())
}
metadata_file = os.path.join(output_dir, 'metadata.json')
with open(metadata_file, 'w') as f:
json.dump(metadata, f, indent=2)
```
%% Cell type:markdown id: tags:
### Save sorted dataset to S3
%% Cell type:code id: tags:
``` python
from dotenv import load_dotenv
import os
import json
import re
import sys
import dspy
current_dir = os.getcwd()
parent_dir = f"{os.path.dirname(current_dir)}"
sys.path.append(f"{parent_dir}")
from functions import *
from xml_util import *
load_dotenv()
```
%% Output
True
%% Cell type:code id: tags:
``` python
from dotenv import load_dotenv
load_dotenv()
```
%% Cell type:code id: tags:
``` python
# S3 bucket and subfolder details
bucket_name = "ai4xml-data"
subfolder = "training/" # Specify your subfolder path with trailing slash
dataset_path = os.path.join(output_dir, 'dataset.json')
# Upload dataset to S3 subfolder by prepending the subfolder path
s3_path = subfolder + "prefaces_dataset.json"
upload_success = upload_to_s3(
dataset_path,
bucket_name,
s3_path,
aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY')
)
if upload_success:
print(f"Dataset successfully uploaded to s3://{bucket_name}/{s3_path}")
else:
print("Failed to upload dataset to S3")
```
%% Output
Upload Successful: training/prefaces_dataset.json
Dataset successfully uploaded to s3://ai4xml-data/training/prefaces_dataset.json
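%% Cell type:markdown id: tags:
`upload_to_s3` is imported from the project's `functions` module. A minimal sketch of what such a helper typically looks like with boto3 is shown below; this is an assumption about the implementation, not the project's actual code.
%% Cell type:code id: tags:
``` python
# Illustrative sketch (assumption): a boto3-based upload helper similar in spirit to upload_to_s3.
import boto3

def upload_to_s3_sketch(local_path, bucket_name, s3_key,
                        aws_access_key_id=None, aws_secret_access_key=None):
    s3 = boto3.client(
        "s3",
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    try:
        s3.upload_file(local_path, bucket_name, s3_key)
        print(f"Upload Successful: {s3_key}")
        return True
    except Exception as e:
        print(f"Upload failed: {e}")
        return False
```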
%% Cell type:markdown id: tags:
# Dataset analysis
%% Cell type:code id: tags:
``` python
import pandas as pd
import os
output_dir = f'{DATA_DIR}/articles/datasets/html'
# Read the dataset that was saved earlier
dataset_file = os.path.join(output_dir, 'dataset.json')
df_articles_sorted = pd.read_json(dataset_file)
print(f"Total records: {len(df_articles_sorted)}")
```
%% Output
Total records: 2938
%% Cell type:code id: tags:
``` python
# Analyze tag usage across all documents
doc_tags, all_unique_tags = analyze_tags_in_k_rows(df_articles_sorted, k=len(df_articles_sorted))
```
%% Output
Unique XML tags and their document counts from the first 2938 documents:
- alinea: 1227 document(s)
- article: 1227 document(s)
- authorialNote: 191 document(s)
- content: 1227 document(s)
- date: 428 document(s)
- def: 10 document(s)
- defBody: 7 document(s)
- heading: 181 document(s)
- intro: 69 document(s)
- list: 69 document(s)
- mod: 35 document(s)
- num: 1227 document(s)
- p: 1227 document(s)
- paragraph: 151 document(s)
- point: 69 document(s)
- quantity: 20 document(s)
- quotedStructure: 2 document(s)
- ref: 189 document(s)
- span: 1216 document(s)
- table: 8 document(s)
- td: 8 document(s)
- term: 3 document(s)
- tr: 8 document(s)
Total number of unique tags in first 2938 documents: 23
%% Cell type:code id: tags:
``` python
# Find documents containing the given tag
search_tag = 'table'
docs_with_search_tag = [id for id, tags in doc_tags.items() if search_tag in tags]
docs_with_search_tag
```
%% Output
['32019R0952', '32019R1926', '32018R0076', '32019R2219']
%% Cell type:markdown id: tags:
## Extract tags from each article
%% Cell type:code id: tags:
``` python
import xml.etree.ElementTree as ET
def extract_tags(xml_string):
try:
root = ET.fromstring(xml_string)
return [elem.tag for elem in root.iter()]
except ET.ParseError:
return []
df_articles_sorted['tags'] = df_articles_sorted['xml'].apply(extract_tags)
```
%% Cell type:code id: tags:
``` python
df_articles_sorted.head()
```
%% Output
celex_id xml \
0 32020R1641 <article eId="art_3" GUID="003">\n <num...
1 32017R1110 <article eId="art_5" GUID="005">\n <num...
2 32016R2075 <article eId="art_1" GUID="001">\n <num...
3 32016R2148 <article eId="art_3" GUID="003">\n <num...
4 32020R0501 <article eId="art_4" GUID="004">\n <num...
html similarity \
0 <p>Article 3</p>\n<p><strong>Equivalence</stro... 1.0
1 <p>Article 5</p>\n<p>Notification of changes t... 1.0
2 <p>Article 1</p>\n<p>The maximum number of day... 1.0
3 <p>Article 3</p>\n<p>Importers who have alread... 1.0
4 <p>Article 4</p>\n<p>By way of derogation from... 1.0
tags
0 [article, num, heading, alinea, content, p]
1 [article, num, heading, paragraph, num, alinea...
2 [article, num, alinea, content, p]
3 [article, num, alinea, content, p]
4 [article, num, alinea, content, p, mod]
%% Cell type:code id: tags:
``` python
ignored_tags = ['mod']
# remove ignored tags from each article's tag list, then drop articles left with no tags
df_articles_sorted['tags'] = df_articles_sorted['tags'].apply(lambda tags: [tag for tag in tags if tag not in ignored_tags])
df_articles_sorted = df_articles_sorted[df_articles_sorted['tags'].apply(lambda tags: len(tags) > 0)]
```
%% Cell type:markdown id: tags:
# Creating a balanced dataset
%% Cell type:code id: tags:
``` python
df_articles_sorted['tag_label'] = df_articles_sorted['tags'].apply(lambda tags: '-'.join(sorted(set(tags))))
```
%% Cell type:code id: tags:
``` python
df_articles_sorted
```
%% Output
celex_id xml \
0 32020R1641 <article eId="art_3" GUID="003">\n <num...
1 32017R1110 <article eId="art_5" GUID="005">\n <num...
2 32016R2075 <article eId="art_1" GUID="001">\n <num...
3 32016R2148 <article eId="art_3" GUID="003">\n <num...
4 32020R0501 <article eId="art_4" GUID="004">\n <num...
... ... ...
2933 32016R0983 <article eId="art_1" GUID="001">\n <num...
2934 32018R1935 <article eId="art_2" GUID="002">\n <num...
2935 32019R0985 <article eId="art_2" GUID="002">\n <num...
2936 32020R0182 <article eId="art_6" GUID="006">\n <num...
2937 32016R1737 <article eId="art_2" GUID="002">\n <num...
html similarity \
0 <p>Article 3</p>\n<p><strong>Equivalence</stro... 1.000000
1 <p>Article 5</p>\n<p>Notification of changes t... 1.000000
2 <p>Article 1</p>\n<p>The maximum number of day... 1.000000
3 <p>Article 3</p>\n<p>Importers who have alread... 1.000000
4 <p>Article 4</p>\n<p>By way of derogation from... 1.000000
... ... ...
2933 <p>Article 1</p>\n<p>Council Regulation (EC) N... 0.439238
2934 <p>Article 2</p>\n<p>This Regulation shall ent... 0.426355
2935 <p>Article 2</p>\n<p>This Regulation shall ent... 0.419034
2936 <p>Article 6</p>\n<p>This Regulation shall ent... 0.350657
2937 <p>Article 2</p>\n<p>This Regulation shall ent... 0.167723
tags \
0 [article, num, heading, alinea, content, p]
1 [article, num, heading, paragraph, num, alinea...
2 [article, num, alinea, content, p]
3 [article, num, alinea, content, p]
4 [article, num, alinea, content, p]
... ...
2933 [article, num, alinea, content, p, authorialNo...
2934 [article, num, alinea, content, p, date, aline...
2935 [article, num, alinea, content, p, span]
2936 [article, num, alinea, content, p, span]
2937 [article, num, alinea, content, p, span]
single_tag \
0 ['article', 'num', 'heading', 'alinea', 'conte...
1 ['article', 'num', 'heading', 'paragraph', 'nu...
2 ['article', 'num', 'alinea', 'content', 'p']
3 ['article', 'num', 'alinea', 'content', 'p']
4 ['article', 'num', 'alinea', 'content', 'p']
... ...
2933 ['article', 'num', 'alinea', 'content', 'p', '...
2934 ['article', 'num', 'alinea', 'content', 'p', '...
2935 ['article', 'num', 'alinea', 'content', 'p', '...
2936 ['article', 'num', 'alinea', 'content', 'p', '...
2937 ['article', 'num', 'alinea', 'content', 'p', '...
tag_label
0 alinea-article-content-heading-num-p
1 alinea-article-content-heading-num-p-paragraph
2 alinea-article-content-num-p
3 alinea-article-content-num-p
4 alinea-article-content-num-p
... ...
2933 alinea-article-authorialNote-content-date-num-...
2934 alinea-article-content-date-num-p
2935 alinea-article-content-num-p-span
2936 alinea-article-content-num-p-span
2937 alinea-article-content-num-p-span
[2938 rows x 7 columns]
%% Cell type:code id: tags:
``` python
# Find tag labels that appear only once (required for stratified split)
tag_label_counts = df_articles_sorted['tag_label'].value_counts()
single_occurrence_labels = tag_label_counts[tag_label_counts == 1].index
# Remove rows with tags that only appear once
df_articles_sorted = df_articles_sorted[~df_articles_sorted['tag_label'].isin(single_occurrence_labels)]
print(f"Removed {len(single_occurrence_labels)} documents with unique tag combinations")
```
%% Output
Removed 35 documents with unique tag combinations
%% Cell type:code id: tags:
``` python
plot_tag_distribution(df_articles_sorted)
```
%% Output
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import train_test_split
# Split into train and temp (80%) and test (20%)
train_val_df, test_df = train_test_split(df_articles_sorted, test_size=0.2, random_state=42, stratify=df_articles_sorted['tag_label'])
# Split train_val into train (20%) and validation (80%)
# DSPy suggests starting with this configuration: https://dspy.ai/learn/optimization/overview/?h=20%25
train_df, val_df = train_test_split(train_val_df, test_size=0.8, random_state=42, stratify=train_val_df['tag_label'])
# Output results
print(f"Train Set Size: {len(train_df)}")
print(f"Validation Set Size: {len(val_df)}")
print(f"Test Set Size: {len(test_df)}")
```
%% Output
Train Set Size: 464
Validation Set Size: 1858
Test Set Size: 581
%% Cell type:code id: tags:
``` python
# Inspect tag distribution across splits
from collections import Counter
def check_tag_distribution(split_name, split_df):
tags_in_split = [tag for tags in split_df['tags'] for tag in tags]
print(f"Tag Distribution in {split_name}: {Counter(tags_in_split)}")
check_tag_distribution("Train", train_df)
check_tag_distribution("Validation", val_df)
check_tag_distribution("Test", test_df)
```
%% Output
Tag Distribution in Train: Counter({'p': 632, 'content': 588, 'num': 570, 'alinea': 552, 'article': 464, 'span': 226, 'date': 121, 'heading': 89, 'paragraph': 69, 'point': 39, 'authorialNote': 31, 'ref': 31, 'intro': 13, 'list': 12, 'mod': 8, 'defBody': 8, 'def': 6, 'quantity': 2})
Tag Distribution in Validation: Counter({'p': 2598, 'content': 2392, 'num': 2273, 'alinea': 2270, 'article': 1858, 'span': 924, 'date': 482, 'heading': 356, 'paragraph': 298, 'point': 131, 'ref': 121, 'authorialNote': 116, 'intro': 47, 'list': 40, 'td': 35, 'mod': 26, 'quantity': 14, 'tr': 14, 'table': 7, 'def': 6, 'defBody': 6})
Tag Distribution in Test: Counter({'p': 829, 'content': 754, 'num': 714, 'alinea': 710, 'article': 581, 'span': 283, 'date': 143, 'heading': 109, 'paragraph': 99, 'point': 47, 'authorialNote': 38, 'ref': 38, 'td': 18, 'intro': 17, 'list': 14, 'mod': 7, 'tr': 6, 'quantity': 4, 'def': 3, 'defBody': 3, 'table': 2})
%% Cell type:code id: tags:
``` python
doc_tags, all_unique_tags = analyze_tags_in_k_rows(test_df, k=100)
```
%% Output
All unique XML tags found in first 100 documents:
- alinea
- article
- authorialNote
- content
- date
- heading
- intro
- list
- num
- p
- paragraph
- point
- quantity
- ref
- span
- table
- td
- tr
Total number of unique tags in first 100 documents: 18
%% Cell type:code id: tags:
``` python
# Get the first 100 documents
docs_subset = list(doc_tags.items())[:100]
# Count, for each tag, the number of documents that contain it
from collections import defaultdict
tag_frequency = defaultdict(int)
for _, tags in docs_subset:
for tag in tags:
tag_frequency[tag] += 1
# Sort tags by frequency in descending order
sorted_tags = sorted(tag_frequency.items(), key=lambda x: x[1], reverse=True)
print(f"Tag distribution in first {len(docs_subset)} documents:")
print("\nTag | Frequency | % of Documents")
print("-" * 40)
for tag, freq in sorted_tags:
percentage = (freq / len(docs_subset)) * 100
print(f"{tag:<20} {freq:>5} {percentage:>10.1f}%")
print(f"\nTotal unique tags in first {len(docs_subset)} documents: {len(tag_frequency)}")
```
%% Output
Tag distribution in first 500 documents:
Tag | Frequency | % of Documents
----------------------------------------
content 98 19.6%
article 98 19.6%
p 98 19.6%
num 98 19.6%
alinea 97 19.4%
span 47 9.4%
heading 20 4.0%
date 16 3.2%
paragraph 8 1.6%
authorialNote 7 1.4%
ref 7 1.4%
list 4 0.8%
point 4 0.8%
intro 4 0.8%
table 1 0.2%
tr 1 0.2%
td 1 0.2%
quantity 1 0.2%
Total unique tags in first 500 documents: 18
%% Cell type:code id: tags:
``` python
# Get tag statistics for train_df
tag_counts = Counter([tag for tags in train_df['tags'] for tag in tags])
total_tags = sum(tag_counts.values())
print("\nDetailed Tag Statistics in Train Set:")
print("-" * 50)
print(f"Total number of tags: {total_tags}")
print("\nTag frequencies:")
for tag, count in tag_counts.most_common():
percentage = (count / total_tags) * 100
print(f"{tag}: {count} ({percentage:.2f}%)")
```
%% Output
Detailed Tag Statistics in Train Set:
--------------------------------------------------
Total number of tags: 3461
Tag frequencies:
p: 632 (18.26%)
content: 588 (16.99%)
num: 570 (16.47%)
alinea: 552 (15.95%)
article: 464 (13.41%)
span: 226 (6.53%)
date: 121 (3.50%)
heading: 89 (2.57%)
paragraph: 69 (1.99%)
point: 39 (1.13%)
authorialNote: 31 (0.90%)
ref: 31 (0.90%)
intro: 13 (0.38%)
list: 12 (0.35%)
mod: 8 (0.23%)
defBody: 8 (0.23%)
def: 6 (0.17%)
quantity: 2 (0.06%)
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
# Create stratified splits
%% Cell type:markdown id: tags:
---------*******-------
%% Cell type:code id: tags:
``` python
# Add a column with the length of the XML content
df_articles_sorted['xml_length'] = df_articles_sorted['xml'].str.len()
# Display the 5 rows with the longest XML among the first 1000 rows
longest_xml = df_articles_sorted[:1000].nlargest(5, 'xml_length')
longest_xml
```
%% Output
celex_id xml \
938 32020R0533 <article eId="art_1" GUID="001">\n <num...
914 32016R0911 <article eId="art_2" GUID="002">\n <num...
912 32020R1546 <article eId="art_1" GUID="001">\n <num...
867 32018R0076 <article eId="art_1" GUID="001">\n <num...
653 32017R1272 <article eId="art_1" GUID="001">\n <num...
html similarity \
938 <h1 id="article-1"><em>Article 1</em></h1>\n<p... 0.980638
914 <p>Article 2<br />\nTerms to be disclosed</p>\... 0.992917
912 <p>Article 1</p>\n<p><strong>Inventory structu... 0.993462
867 <p>Article 1</p>\n<p>(1) The fishing opportuni... 1.000000
653 <p>Article 1</p>\n<p>1. The annual national ce... 1.000000
tags xml_length
938 [article, num, heading, paragraph, num, alinea... 5511
914 [article, num, heading, paragraph, num, list, ... 5457
912 [article, num, heading, paragraph, num, alinea... 4774
867 [article, num, paragraph, num, list, intro, p,... 3813
653 [article, num, paragraph, num, alinea, content... 3788
%% Cell type:code id: tags:
``` python
train_df, val_df, test_df = create_stratified_splits(df_articles_sorted[:1000], test_size=0.33, val_size=0.33)
```
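%% Cell type:markdown id: tags:
`create_stratified_splits` is imported from the project's `functions` module. A plausible sketch of what it might do (an assumption, not the actual implementation) is two stratified `train_test_split` calls on the `tag_label` column, with the validation fraction rescaled relative to the remaining rows.
%% Cell type:code id: tags:
``` python
# Illustrative sketch (assumption): a stratified three-way split on tag_label,
# roughly matching how the imported create_stratified_splits is called above.
from sklearn.model_selection import train_test_split

def create_stratified_splits_sketch(df, test_size=0.33, val_size=0.33, random_state=42):
    # Carve out the test set first, stratified by tag_label
    train_val, test = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=df["tag_label"]
    )
    # Split the remainder into train and validation; rescale val_size so it is
    # a fraction of the remaining rows
    rel_val_size = val_size / (1.0 - test_size)
    train, val = train_test_split(
        train_val, test_size=rel_val_size, random_state=random_state,
        stratify=train_val["tag_label"]
    )
    return train, val, test
```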
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import numpy as np
# Create figure
plt.figure(figsize=(15,6))
# Get tag distributions for each dataset
train_tags, _ = analyze_tags_in_k_rows(train_df, k=len(train_df))
val_tags, _ = analyze_tags_in_k_rows(val_df, k=len(val_df))
test_tags, _ = analyze_tags_in_k_rows(test_df, k=len(test_df))
# Get data for each set
train_tags, train_counts = plot_tag_distribution(train_tags, 'Training Set')
val_tags, val_counts = plot_tag_distribution(val_tags, 'Validation Set')
test_tags, test_counts = plot_tag_distribution(test_tags, 'Test Set')
# Get union of all tags to ensure consistent x-axis
all_tags = list(set(train_tags + val_tags + test_tags))
all_tags.sort()
# Create dictionaries mapping tags to counts, defaulting to 0 for missing tags
train_dict = dict(zip(train_tags, train_counts))
val_dict = dict(zip(val_tags, val_counts))
test_dict = dict(zip(test_tags, test_counts))
# Get counts in consistent order
train_counts_aligned = [train_dict.get(tag, 0) for tag in all_tags]
val_counts_aligned = [val_dict.get(tag, 0) for tag in all_tags]
test_counts_aligned = [test_dict.get(tag, 0) for tag in all_tags]
# Plot aligned distributions
width = 0.25
x = np.arange(len(all_tags))
train_bars = plt.bar(x - width, train_counts_aligned, width, label='Training Set')
val_bars = plt.bar(x, val_counts_aligned, width, label='Validation Set')
test_bars = plt.bar(x + width, test_counts_aligned, width, label='Test Set')
# Add value labels on the bars
for bars in [train_bars, val_bars, test_bars]:
for bar in bars:
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2., height,
f'{int(height)}',
ha='center', va='bottom')
plt.xticks(x, all_tags, rotation=45, ha='right')
plt.title('Distribution of XML Tags Across Datasets')
plt.xlabel('Tag Name')
plt.ylabel('Number of Documents')
plt.legend()
plt.tight_layout()
plt.show()
```
%% Output
Unique XML tags and their document counts from the first 342 documents:
- alinea: 295 document(s)
- article: 302 document(s)
- content: 302 document(s)
- date: 18 document(s)
- def: 1 document(s)
- heading: 76 document(s)
- intro: 13 document(s)
- list: 13 document(s)
- mod: 13 document(s)
- num: 302 document(s)
- p: 302 document(s)
- paragraph: 31 document(s)
- point: 13 document(s)
- quantity: 4 document(s)
- ref: 1 document(s)
- span: 3 document(s)
- table: 3 document(s)
- td: 3 document(s)
- term: 1 document(s)
- tr: 3 document(s)
Total number of unique tags in first 342 documents: 20
Unique XML tags and their document counts from the first 328 documents:
- alinea: 280 document(s)
- article: 282 document(s)
- content: 282 document(s)
- date: 18 document(s)
- def: 1 document(s)
- heading: 69 document(s)
- intro: 12 document(s)
- list: 12 document(s)
- mod: 11 document(s)
- num: 282 document(s)
- p: 282 document(s)
- paragraph: 27 document(s)
- point: 12 document(s)
- quantity: 2 document(s)
- ref: 2 document(s)
- span: 2 document(s)
- term: 1 document(s)
Total number of unique tags in first 328 documents: 17
Unique XML tags and their document counts from the first 330 documents:
- alinea: 290 document(s)
- article: 295 document(s)
- content: 294 document(s)
- date: 17 document(s)
- def: 2 document(s)
- defBody: 1 document(s)
- heading: 77 document(s)
- intro: 14 document(s)
- list: 14 document(s)
- mod: 12 document(s)
- num: 295 document(s)
- p: 295 document(s)
- paragraph: 27 document(s)
- point: 14 document(s)
- quantity: 2 document(s)
- ref: 1 document(s)
- span: 2 document(s)
- table: 1 document(s)
- td: 1 document(s)
- term: 1 document(s)
- tr: 1 document(s)
Total number of unique tags in first 330 documents: 21
%% Cell type:code id: tags:
``` python
longest_xml = test_df[:1000].nlargest(5, 'xml_length')
longest_xml
```
%% Output
celex_id xml \
6 32020R0533 <article eId="art_1" GUID="001">\n <num...
307 32020R0750 <article eId="art_1" GUID="001">\n <num...
108 32018R0891 <article eId="art_1" GUID="001">\n <num...
191 32018R2018 <article eId="art_3" GUID="003">\n <num...
118 32018R1095 <article eId="art_1" GUID="001">\n <num...
html similarity \
6 <h1 id="article-1"><em>Article 1</em></h1>\n<p... 0.980638
307 <p>Article 1</p>\n<p>1. By way of derogation f... 0.968563
108 <p>Article 1</p>\n<p>The annual national ceili... 1.000000
191 <p>Article 3</p>\n<p><strong>Content of the te... 0.985714
118 <p>Article 1</p>\n<p>1. The fishing opportunit... 1.000000
tags xml_length
6 [article, num, heading, paragraph, num, alinea... 5511
307 [article, num, paragraph, num, alinea, content... 3147
108 [article, num, alinea, content, p, alinea, con... 2780
191 [article, num, heading, list, intro, p, point,... 2697
118 [article, num, paragraph, num, list, intro, p,... 2529
%% Cell type:code id: tags:
``` python
print(f"length of train_df: {len(train_df)}")
print(f"length of val_df: {len(val_df)}")
print(f"length of test_df: {len(test_df)}")
```
%% Output
length of train_df: 342
length of val_df: 328
length of test_df: 330
%% Cell type:code id: tags:
``` python
# Create directories if they don't exist
import os
os.makedirs(f'{DATA_DIR}/articles/datasets/html/stratified', exist_ok=True)
# Save each dataset to a JSON file in the html subdirectory
train_df.to_json(f'{DATA_DIR}/articles/datasets/html/stratified/train.json', orient='records', indent=2)
val_df.to_json(f'{DATA_DIR}/articles/datasets/html/stratified/val.json', orient='records', indent=2)
test_df.to_json(f'{DATA_DIR}/articles/datasets/html/stratified/test.json', orient='records', indent=2)
```
@@ -275,6 +275,82 @@ def extract_preamble_from_text(content, file_path):
}
}
def extract_preamble_from_html(content, file_path):
celex_id = os.path.basename(file_path).split('.')[0]
# Remove <mark> tags and their content
content = re.sub(r'<mark>([^<]*(?:<(?!/mark>)[^<]*)*)</mark>', r'\1', content)
# Pattern to capture the preamble components in HTML
pattern = r"((?:<p>THE EUROPEAN COMMISSION,|<p>THE COUNCIL OF THE EUROPEAN UNION,|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION|<p>THE EUROPEAN COMMISSION|<p>THE GOVERNING COUNCIL OF THE EUROPEAN CENTRAL BANK,|<p>THE COUNCIL OF THE EUROPEAN UNION|<p>THE COUNCIL OF THE OPEAN UNION,|<p>THE EUROPEAN PARLIAMENT,|<p>THE EUROPEAN COMMUNITIES,|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL,|<p>The GOVERNING COUNCIL OF THE EUROPEAN CENTRAL BANK,|<p>THE MANAGEMENT BOARD,).*?)(?=<p>HAS ADOPTED THIS REGULATION:|<p>HAVE ADOPTED THIS REGULATION:|<p>HAS ADOPTED THIS DIRECTIVE:|<p>HAVE ADOPTED THIS DIRECTIVE:|<p>HAS ADOPTED THIS DECISION:|<p>HAVE ADOPTED THIS DECISION:|<p>HAS ADOPTED THE FOLLOWING REGULATION:)"
match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
if not match:
return {
"success": False,
"error": "No preamble found",
"file_path": file_path
}
preamble = match.group(1).strip()
# Extract formula (commission declaration) using a separate function
formula = extract_formula_from_html(preamble)
if not formula:
return {
"success": False,
"error": "No formula found",
"file_path": file_path
}
# Extract citations and recitals using a separate function
citations_part, recitals_part = extract_citations_and_recitals_from_html(preamble)
citations = extract_citations_from_html(citations_part)
if citations and citations[0] == formula:
citations = citations[1:] # Remove the formula from citations
else:
return {
"success": False,
"error": "Formula not found as first citation",
"file_path": file_path,
"formula": formula,
"citations": citations
}
recitals_result = extract_recitals_from_html(recitals_part, celex_id)
# Check if there are errors in recitals extraction
if recitals_result["errors"]:
return {
"success": False,
"error": "Errors in recitals extraction",
"file_path": file_path,
"recitals_errors": recitals_result["errors"]
}
# Extract preamble final using a separate function
preamble_final = extract_preamble_final_from_html(content)
# Extract footnotes
footnotes = extract_footnotes_from_html(content)
return {
"success": True,
"data": {
"formula": formula.strip(),
"citations": citations,
"recitals": recitals_result["recitals"],
"recitals_duplicates": recitals_result["duplicates"],
"preamble_final": preamble_final.strip(),
"footnotes": footnotes
}
}
def replace_footnotes_in_text(text, footnotes_dict):
"""Replace footnote references with their content in a list of text strings."""
footnote_numbers = re.findall(r'\[\^(\d+)\]', text)
@@ -308,6 +384,26 @@ def extract_preamble_final(content):
return final_phrase_match.group(1) if final_phrase_match else ""
def extract_preamble_final_from_html(content):
import re
final_phrase_patterns = [
r"(<p>HAS ADOPTED THIS REGULATION:</p>)",
r"(<p>HAVE ADOPTED THIS REGULATION:</p>)",
r"(<p>HAS ADOPTED THIS DIRECTIVE:</p>)",
r"(<p>HAVE ADOPTED THIS DIRECTIVE:</p>)",
r"(<p>HAS ADOPTED THIS DECISION:</p>)",
r"(<p>HAVE ADOPTED THIS DECISION:</p>)",
r"(<p>HAS ADOPTED THE FOLLOWING REGULATION:</p>)",
]
for pattern in final_phrase_patterns:
final_phrase_match = re.search(pattern, content, re.IGNORECASE)
if final_phrase_match:
break
return final_phrase_match.group(1) if final_phrase_match else ""
def extract_formula(preamble):
import re
@@ -317,6 +413,23 @@ def extract_formula(preamble):
return formula_match.group(1) if formula_match else None
def extract_formula_from_html(html_content):
import re
# Define the pattern for extracting the formula with p tags
formula_pattern = r"(<p>THE EUROPEAN COMMISSION,</p>|<p>THE COUNCIL OF THE EUROPEAN UNION,</p>|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,</p>|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,</p>|<p>THE EUROPEAN COMMISSION,</p>|<p>THE GOVERNING COUNCIL OF THE EUROPEAN CENTRAL BANK,</p>|<p>THE COUNCIL OF THE EUROPEAN UNION,</p>|<p>THE COUNCIL OF THE OPEAN UNION,</p>|<p>THE EUROPEAN PARLIAMENT,</p>|<p>THE EUROPEAN COMMUNITIES,</p>|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL,</p>|<p>The GOVERNING COUNCIL OF THE EUROPEAN CENTRAL BANK,</p>|<p>THE MANAGEMENT BOARD,</p>)"
formula_match = re.search(formula_pattern, html_content)
if formula_match:
formula_html = formula_match.group(1)
return formula_html
return None
def extract_citations_and_recitals(preamble):
# Split the preamble into citations and recitals using 'Whereas' as the separator
@@ -331,6 +444,20 @@ def extract_citations_and_recitals(preamble):
return citations_part, recitals_part
def extract_citations_and_recitals_from_html(html_content):
# Split the preamble into citations and recitals using 'Whereas' as the separator
splitters = ["<p>Whereas:</p>", "<p>WHEREAS:</p>", "<p>Whereas,</p>", "<p>Whereas</p>", "<p>whereas:</p>"]
for splitter in splitters:
if splitter in html_content:
citations_part, recitals_part = html_content.split(splitter, 1)
break
else:
citations_part = None
recitals_part = None
return citations_part, recitals_part
def extract_citations(citations_part):
if citations_part:
@@ -344,6 +471,18 @@ def extract_citations(citations_part):
return []
def extract_citations_from_html(citations_part):
if citations_part:
return [
citation.strip() + "</p>"
for citation in citations_part.split("</p>\n")
if citation.strip()
]
else:
print("citation part is empty!")
return []
def extract_recitals(recitals_text, celex_id):
recitals = {}
duplicates = []
@@ -389,6 +528,51 @@ def extract_recitals(recitals_text, celex_id):
return result
def extract_recitals_from_html(recitals_text, celex_id):
recitals = {}
duplicates = []
result = {
"recitals": {},
"duplicates": [],
"errors": []
}
# Corrected pattern to match actual new lines and the digit in parentheses
pattern = r"\n(?:<p>\s*(?:\((\d+)\)|<p>\((\d+)\)))"
# Find all matches of the pattern
matches = list(re.finditer(pattern, recitals_text))
for i in range(len(matches)):
start = matches[i].start()
end = matches[i + 1].start() if i + 1 < len(matches) else len(recitals_text)
recital_number = int(matches[i].group(1) or matches[i].group(2))
recital_text = recitals_text[start:end].strip()
if recital_number in recitals:
duplicates.append(recital_number)
# Extract the number value from the beginning of the recital text
number_match = re.match(r"^\s*(?:\\\((\d+)\\\)|\((\d+)\))", recital_text)
if number_match:
corrected_number = max(recitals.keys()) + 1
recital_text = re.sub(
r"^\s*(?:\\\(\d+\\\)|\(\d+\))", f"({corrected_number})", recital_text
)
recitals[corrected_number] = recital_text
else:
result["errors"].append(f"Recital with no number found: {recital_text}")
else:
recitals[recital_number] = recital_text
result["recitals"] = recitals
result["duplicates"] = duplicates
return result
def extract_preamble_from_xml(xml_content):
root = ET.fromstring(xml_content)
@@ -597,6 +781,49 @@ def create_preamble_dataset_from_md(content_files):
return preamble_dataset, failed_analysis, recitals_duplicates
def create_preamble_dataset_from_html(content_files):
preamble_dataset = []
failed_analysis = []
recitals_duplicates = []
for file_path in tqdm(content_files, desc="Processing HTML files"):
with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
filename_no_ext = os.path.splitext(os.path.basename(file_path))[0]
if content.strip().startswith("<img"):
failed_analysis.append({
"file_path": file_path,
"error": "File starts with image tag"
})
continue
preamble_data = extract_preamble_from_html(content, file_path)
if preamble_data["success"]:
preamble_dataset.append({
"celex_id": filename_no_ext,
"formula": preamble_data["data"]["formula"],
"citations": preamble_data["data"]["citations"],
"recitals": preamble_data["data"]["recitals"],
"preamble_final": preamble_data["data"]["preamble_final"],
"footnotes": preamble_data["data"]["footnotes"]
})
if preamble_data["data"]["recitals_duplicates"]:
recitals_duplicates.append({
"celex_id": filename_no_ext,
"duplicates": preamble_data["data"]["recitals_duplicates"]
})
else:
failed_analysis.append({
"file_path": file_path,
"error": preamble_data["error"]
})
return preamble_dataset, failed_analysis, recitals_duplicates
def create_preamble_dataset_from_xml(xml_directory, output_file):
preamble_dataset = []
@@ -1598,6 +1825,32 @@ def extract_footnotes(content):
return footnotes
def extract_footnotes_from_html(html_content):
"""Extract footnotes from HTML content using BeautifulSoup."""
from bs4 import BeautifulSoup
footnotes = {}
soup = BeautifulSoup(html_content, 'html.parser')
# Find the footnotes section
footnotes_section = soup.find('section', id='footnotes')
if footnotes_section:
# Find all footnote list items
footnote_items = footnotes_section.find_all('li')
for item in footnote_items:
number = item.get('id').replace('fn', '')
# Find and remove the backlink
p_tag = item.find('p')
backlink = p_tag.find('a', class_='footnote-back')
if backlink:
backlink.decompose()
# Get the remaining HTML content
footnote_content = str(p_tag)
footnotes[number] = footnote_content
return footnotes
# --------------- TAGS ANALYSIS ---------------
def analyze_tag_usage(xml_files, root_tag="preface"):
@@ -1621,6 +1874,31 @@ def analyze_tag_usage(xml_files, root_tag="preface"):
return tag_counts
def get_files_with_specific_tags(xml_files, specific_tags, root_tag="preface"):
"""
Get a list of file IDs (filenames without extension) containing specific XML tags.
Args:
xml_files: List of XML file paths
specific_tags: Set of tags to search for
root_tag: Root tag to start searching from
Returns:
List of file IDs that contain any of the specific tags
"""
files_with_tags = []
for xml_file in tqdm(xml_files, desc="Searching for specific tags"):
with open(xml_file, "r", encoding="utf-8") as file:
xml_content = file.read()
doc_tags = extract_unique_tags_from_xml(xml_content, root_tag=root_tag)
if any(tag in doc_tags for tag in specific_tags):
file_id = os.path.splitext(os.path.basename(xml_file))[0]
files_with_tags.append(file_id)
return files_with_tags
def extract_tags(xml_string):
try:
root = ET.fromstring(xml_string)