Commit 566a5ed4 authored by Nasredine CHENIKI

document types distribution by author

parent 2a70d693
Branch: dev
@@ -8,7 +8,7 @@ from lxml import etree
from datetime import datetime
def analyze_xml_files(directory,schema_path):
def analyze_xml_files(directory,schema_path, only_planjo=False, planjo_docx_dir=None):
results = []
total_pages_sum = 0
total_pages_count = 0
@@ -25,6 +25,8 @@ def analyze_xml_files(directory,schema_path):
schema = etree.XMLSchema(file=schema_path)
files = [f for f in os.listdir(directory) if f.endswith('.xml')]
if only_planjo:
files = [f for f in files if f.replace('.xml', '.docx') in os.listdir(planjo_docx_dir)]
# Walk through all files in the specified directory with a progress bar
for filename in tqdm(files, desc="Analyzing XML files"):
@@ -66,6 +68,10 @@ def analyze_xml_files(directory,schema_path):
language = root.find('.//akn:FRBRlanguage', namespace).get('language') if root.find('.//akn:FRBRlanguage', namespace) is not None else 'Not found'
series_type = root.find('.//fmx:COLL', namespace).text if root.find('.//fmx:COLL', namespace) is not None else 'Not found'
act_type = root.find('.//akn:act', namespace).get('name') if root.find('.//akn:act', namespace) is not None else 'Not found'
author = root.find('.//akn:FRBRauthor', namespace).get('href') if root.find('.//akn:FRBRauthor', namespace) is not None else 'Not found'
if author.startswith('#'):
author = author[1:]
# Extract EuroVOC keywords
eurovoc_keywords = []
@@ -85,7 +91,8 @@ def analyze_xml_files(directory,schema_path):
'Series Type': series_type,
'Act Type': act_type,
'Schema Validation': valid,
'EuroVOC Keywords': eurovoc_keywords
'EuroVOC Keywords': eurovoc_keywords,
'Author': author
})
except ET.ParseError as e:
@@ -350,3 +357,106 @@ def plot_validation_results(number_of_valid_files, number_of_invalid_files):
plt.title(f'XML Schema Validation Results\nTotal: {total_files:,} documents')
plt.show()
def generate_author_type_csv(results, output_file):
# Create dictionary to store type-author counts
type_author_counts = {}
# Process results to count documents by type and author
for result in results:
author = result['Author']
act_type = result['Act Type']
if author != 'Not found':
author = author.split('/')[-1] # Extract last part of href URL
if act_type not in type_author_counts:
type_author_counts[act_type] = {}
if author not in type_author_counts[act_type]:
type_author_counts[act_type][author] = 0
type_author_counts[act_type][author] += 1
# Get unique list of all authors and their English labels
all_authors = []
unique_authors = sorted(set(author for counts in type_author_counts.values()
for author in counts.keys()))
for author in unique_authors:
try:
author_url = f'http://publications.europa.eu/resource/authority/corporate-body/{author}'
response = requests.get(author_url)
if response.status_code == 200:
# Parse RDF response
root = ET.fromstring(response.content)
# Find English label using RDF namespaces
ns = {
'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'skos': 'http://www.w3.org/2004/02/skos/core#',
'xml': 'http://www.w3.org/XML/1998/namespace'
}
# Find Description element for this author
desc = root.find(f".//rdf:Description[@rdf:about='{author_url}']", ns)
if desc is not None:
# Find English prefLabel within Description
label_elem = desc.find("skos:prefLabel[@xml:lang='en']", ns)
if label_elem is not None:
all_authors.append(label_elem.text)
continue
all_authors.append(author) # Fallback if no label found
else:
all_authors.append(author) # Fallback if request fails
except Exception:
all_authors.append(author) # Fallback if any error occurs
# Write to CSV
with open(output_file, 'w', newline='') as f:
writer = csv.writer(f)
# Write header row with authors
header = ['Document Type'] + all_authors
writer.writerow(header)
# Write data rows
for doc_type in sorted(type_author_counts.keys()):
row = [doc_type]
for author in all_authors:
count = type_author_counts[doc_type].get(author, 0)
row.append(count)
writer.writerow(row)
return type_author_counts, all_authors
def plot_author_type_distribution(type_author_counts, all_authors):
# Convert to DataFrame for easier plotting
data = []
for doc_type in type_author_counts:
for author in all_authors:
count = type_author_counts[doc_type].get(author, 0)
if count > 0: # Only include non-zero counts
data.append({
'Document Type': doc_type,
'Author': author,
'Count': count
})
df = pd.DataFrame(data)
# Create a stacked bar plot
plt.figure(figsize=(15, 8))
pivot_table = df.pivot(index='Document Type', columns='Author', values='Count').fillna(0)
# Plot only top 10 document types by total documents
top_10_types = pivot_table.sum(axis=1).sort_values(ascending=False).head(10).index
pivot_table_top10 = pivot_table.loc[top_10_types]
ax = pivot_table_top10.plot(kind='bar', stacked=True)
plt.title('Author Distribution by Top 10 Document Types')
plt.xlabel('Document Type')
plt.ylabel('Number of Documents')
plt.legend(title='Author', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
%% Cell type:markdown id: tags:
# Dataset Creation
%% Cell type:markdown id: tags:
This notebook prepares the preface dataset by converting DOCX files to HTML.
We use HTML as the intermediate format instead of Markdown because:
- LLMs struggled with Markdown formatting
- Handling italic text in Markdown was particularly problematic
- HTML gives better control over structure and formatting
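%% Cell type:markdown id: tags:
The DOCX-to-HTML conversion itself is not performed in this notebook. As an illustration only, the sketch below shows one possible way to produce the HTML inputs, assuming Pandoc via the pypandoc wrapper; the directory names are hypothetical.
%% Cell type:code id: tags:
``` python
# Illustrative sketch (assumption): convert DOCX files to HTML with Pandoc via pypandoc.
# Directory names are hypothetical; the actual conversion pipeline is external to this notebook.
import os
import pypandoc

docx_dir = "data/genai4lex_word_docs"   # hypothetical input directory
html_dir = "data/genai4lex_html_docs"   # hypothetical output directory
os.makedirs(html_dir, exist_ok=True)

for name in os.listdir(docx_dir):
    if name.endswith(".docx"):
        out_path = os.path.join(html_dir, name.replace(".docx", ".html"))
        pypandoc.convert_file(os.path.join(docx_dir, name), "html", outputfile=out_path)
```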
%% Cell type:code id: tags:
``` python
%load_ext autoreload
%autoreload 2
```
%% Cell type:code id: tags:
``` python
import os
import sys
import pandas as pd
current_dir = os.getcwd()
parent_dir = f"{os.path.dirname(current_dir)}/../"
sys.path.append(parent_dir)
from functions import *
from xml_util import *
```
%% Cell type:code id: tags:
``` python
DATA_DIR = f"{parent_dir}/data/"
```
%% Cell type:code id: tags:
``` python
# load df from data/genai4lex_dataset_with_corresponding_planjo_docx_cleaned.csv
# produced from dataset_preparation_and_statistics notebook
df_genai4lex_with_planjo_docx_cleaned = pd.read_csv(f"{DATA_DIR}/genai4lex_dataset_with_corresponding_planjo_docx_cleaned.csv")
```
%% Output
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 3
1 # load df from data/genai4lex_dataset_with_corresponding_planjo_docx_cleaned.csv
2 # produced from dataset_preparation_and_statistics notebook
----> 3 DATA_DIR = f"{parent_dir}/data/"
4 df_genai4lex_with_planjo_docx_cleaned = pd.read_csv(f"{DATA_DIR}/genai4lex_dataset_with_corresponding_planjo_docx_cleaned.csv")
NameError: name 'parent_dir' is not defined
%% Cell type:code id: tags:
``` python
print("Unique act types:")
print(df_genai4lex_with_planjo_docx_cleaned['Act Type'].unique())
```
%% Output
Unique act types:
['regulation/REG_IMPL' 'regulation/REG' 'directive/DIR'
'regulation/REG_DEL' 'directive/DIR_DEL' 'regulation/CORRIGENDUM'
'complementaryLegislation/REG' 'complementaryLegislation/DATPRO'
'directive/DIR_IMPL' 'directive/CORRIGENDUM' 'regulation/REGDEL'
'complementaryLegislation/DEC' 'decision' 'complementaryLegislation/ACT'
'directive/REG']
%% Cell type:markdown id: tags:
### Filter document types
%% Cell type:code id: tags:
``` python
# Filter for specific regulation types
regulation_types = [
'regulation/REG_IMPL', # Implementing regulations
'regulation/REG' # Regular regulations
]
df_regulations = df_genai4lex_with_planjo_docx_cleaned[df_genai4lex_with_planjo_docx_cleaned['Act Type'].isin(regulation_types)]
print(f"Original dataset size: {len(df_genai4lex_with_planjo_docx_cleaned)}")
print(f"Filtered dataset size: {len(df_regulations)}")
```
%% Output
Original dataset size: 3681
Filtered dataset size: 2902
%% Cell type:code id: tags:
``` python
word_docs_dir = f"{DATA_DIR}/genai4lex_word_docs/"
xml_docs_dir = f"{DATA_DIR}/genai4lex_dataset/"
html_docs_dir = f"{DATA_DIR}/genai4lex_html_docs/"
```
%% Cell type:markdown id: tags:
# Analysis
%% Cell type:markdown id: tags:
## Keep documents smaller than 20 KB with no "amending" or "correcting" in the long title
%% Cell type:code id: tags:
``` python
# Define size threshold in KB
MAX_SIZE_KB = 20
# keep only xml files that are in the df, have size < MAX_SIZE_KB, and don't have "amending" or "correcting" in their long title
xml_files = []
xml_sizes = {}
for celex_id in df_regulations['celex_id']:
xml_path = os.path.join(xml_docs_dir, f"{celex_id}.xml")
if os.path.exists(xml_path):
file_size = os.path.getsize(xml_path)
if file_size < MAX_SIZE_KB * 1024:
with open(xml_path, 'r', encoding='utf-8') as f:
content = f.read()
root = ET.fromstring(content)
long_title = root.find('.//{*}longTitle')
if long_title is not None:
title_text = ''.join(long_title.itertext())
if 'amending' not in title_text.lower() and 'correcting' not in title_text.lower():
xml_sizes[xml_path] = file_size
# Sort files by size and extract just the paths
xml_files = [path for path, size in sorted(xml_sizes.items(), key=lambda x: x[1], reverse=True)]
```
%% Cell type:code id: tags:
``` python
print(f"Number of XML files: {len(xml_files)}")
print("Example celex ids with sizes:")
for i in range(min(5, len(xml_files))):
file_path = xml_files[i]
file_id = os.path.splitext(os.path.basename(file_path))[0]
size = xml_sizes[file_path] / 1024 # Convert to KB
print(f"- {file_id}: {size:.2f} KB")
```
%% Output
Number of XML files: 1245
Example celex ids with sizes:
- 32017R0185: 19.82 KB
- 32018R2018: 19.75 KB
- 32018R1882: 19.68 KB
- 32018R0581: 19.63 KB
- 32018R0922: 19.53 KB
%% Cell type:code id: tags:
``` python
unique_tags = set()
for xml_file in tqdm(xml_files, desc="Processing XML files"):
with open(xml_file, "r", encoding="utf-8") as file:
xml_content = file.read()
doc_tags = extract_unique_tags_from_xml(xml_content, root_tag="body")
unique_tags.update(doc_tags)
print(f"Total unique tags: {len(unique_tags)}")
for tag in sorted(unique_tags):
print(f"- {tag}")
```
%% Output
Processing XML files: 100%|██████████| 1245/1245 [00:00<00:00, 3424.96it/s]
Total unique tags: 24
- alinea
- article
- authorialNote
- body
- content
- date
- def
- defBody
- heading
- intro
- list
- mod
- num
- p
- paragraph
- point
- quantity
- quotedStructure
- ref
- span
- table
- td
- term
- tr
%% Cell type:code id: tags:
``` python
# Count, for each tag, the number of files that contain it
tag_counts = {tag: 0 for tag in unique_tags}
total_files = len(xml_files)
for xml_file in tqdm(xml_files, desc="Processing XML files"):
with open(xml_file, "r", encoding="utf-8") as file:
xml_content = file.read()
doc_tags = extract_unique_tags_from_xml(xml_content, root_tag="body")
for tag in doc_tags:
tag_counts[tag] += 1
# Calculate and display percentages
print("\nTag usage statistics:")
for tag, count in sorted(tag_counts.items(), key=lambda x: x[1], reverse=True):
percentage = (count / total_files) * 100
print(f"- {tag}: {count} files ({percentage:.1f}%)")
```
%% Output
Processing XML files: 100%|██████████| 1245/1245 [00:00<00:00, 6918.06it/s]
Tag usage statistics:
- article: 1245 files (100.0%)
- body: 1245 files (100.0%)
- alinea: 1245 files (100.0%)
- content: 1245 files (100.0%)
- p: 1245 files (100.0%)
- num: 1245 files (100.0%)
- span: 1234 files (99.1%)
- date: 437 files (35.1%)
- authorialNote: 193 files (15.5%)
- ref: 190 files (15.3%)
- heading: 186 files (14.9%)
- paragraph: 156 files (12.5%)
- point: 70 files (5.6%)
- intro: 70 files (5.6%)
- list: 70 files (5.6%)
- mod: 37 files (3.0%)
- quantity: 21 files (1.7%)
- def: 10 files (0.8%)
- tr: 8 files (0.6%)
- td: 8 files (0.6%)
- table: 8 files (0.6%)
- defBody: 7 files (0.6%)
- quotedStructure: 3 files (0.2%)
- term: 3 files (0.2%)
%% Cell type:code id: tags:
``` python
target_tags = ['term']
print("Files containing rare tags in preamble section:")
for tag in target_tags:
documents_with_body_having_tag, tag_contents = get_files_with_tag(xml_files, tag, target_part="body")
print(f"\n{tag} appears in {len(documents_with_body_having_tag)} files:")
print(documents_with_body_having_tag)
```
%% Output
Files containing rare tags in preamble section:
Processing XML files: 100%|██████████| 1245/1245 [00:00<00:00, 7438.20it/s]
term appears in 3 files:
['32018R1882', '32019R1685', '32018R0329']
%% Cell type:code id: tags:
``` python
# result, analysis_results = analyse_body_dataset_from_xml(xml_files)
# analysis_results['tag_counts']
```
%% Cell type:markdown id: tags:
## Article extraction from XML and HTML files
%% Cell type:markdown id: tags:
### Extract unique conclusion formulas from XML files
These formulas are later used to clean the last article by removing its conclusion part (a sketch of this step follows below).
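%% Cell type:markdown id: tags:
The cleaning step itself is done elsewhere; as a rough sketch only, a collected formula can be stripped from the last article roughly as follows (`last_article_text` is a hypothetical input string, `conclusion_formulas` is the set built in the next cell).
%% Cell type:code id: tags:
``` python
# Illustrative sketch only: remove a known conclusion formula (and anything after it)
# from the text of the last article.
def strip_conclusion(last_article_text, conclusion_formulas):
    for formula in conclusion_formulas:
        idx = last_article_text.find(formula)
        if idx != -1:
            return last_article_text[:idx].rstrip()
    return last_article_text
```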
%% Cell type:code id: tags:
``` python
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
conclusion_formulas = set()
for xml_file in tqdm(xml_files, desc="Processing XML files for formulas"):
celex_id = os.path.splitext(os.path.basename(xml_file))[0]
try:
with open(xml_file, "r", encoding="utf-8") as f:
xml_content = f.read()
# Parse XML content
root = ET.fromstring(xml_content)
# Find all conclusion formulas using ElementTree
# Using {*} to match any namespace
formulas = root.findall(".//{*}formula[@name='conclusionsFormula']/{*}p")
for formula in formulas:
formula_text = formula.text.strip() if formula.text else ""
if formula_text:
conclusion_formulas.add(formula_text)
except ET.ParseError:
print(f"Skipping malformed XML for celex_id: {celex_id}")
continue
except Exception as e:
print(f"Error processing {celex_id}: {e}")
continue
print("Unique conclusion formulas found:")
for formula in sorted(conclusion_formulas):
print(f"\n- {formula}")
```
%% Output
Processing XML files for formulas: 100%|██████████| 1245/1245 [00:00<00:00, 3134.71it/s]
Unique conclusion formulas found:
- It shall apply from
- This Regulation shall be binding in its entirely and directly applicable in all Member States.
- This Regulation shall be binding in its entirety and directly applicable in all Member States.
- This Regulation shall be binding in its entirety and directly applicable in the Member States in accordance with the Treaties.
%% Cell type:code id: tags:
``` python
print("Documents with formula containing 'It shall apply from':")
for xml_file in tqdm(xml_files, desc="Checking XML files for target formula"):
celex_id = os.path.splitext(os.path.basename(xml_file))[0]
try:
with open(xml_file, "r", encoding="utf-8") as f:
xml_content = f.read()
root = ET.fromstring(xml_content)
# Find all <p> elements under formulas with name 'conclusionsFormula'
formulas = root.findall(".//{*}formula[@name='conclusionsFormula']/{*}p")
for formula in formulas:
formula_text = formula.text.strip() if formula.text else ""
if "It shall apply from" in formula_text:
print(celex_id)
break # stop checking further formulas for this document
except ET.ParseError:
print(f"Skipping malformed XML for celex_id: {celex_id}")
except Exception as e:
print(f"Error processing {celex_id}: {e}")
```
%% Output
Documents with formula containing 'It shall apply from':
Checking XML files for target formula: 21%|██ | 260/1245 [00:00<00:00, 2599.87it/s]
32020R1801
Checking XML files for target formula: 100%|██████████| 1245/1245 [00:00<00:00, 3638.80it/s]
%% Cell type:markdown id: tags:
Removing an edge-case document that has a wrong conclusion formula
%% Cell type:code id: tags:
``` python
# Remove document 32020R1801 from xml_files list
xml_files = [f for f in xml_files if "32020R1801" not in f]
```
%% Cell type:markdown id: tags:
### Extract articles from xml and html files
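%% Cell type:markdown id: tags:
The helpers `extract_document_part` and `extract_body_from_html` used in the next cell are imported from the project's `functions`/`xml_util` modules and are not defined here. As an assumption about what the HTML-side helper does, a minimal sketch is shown below: it splits a document's HTML into per-article chunks at "Article N" markers (the trailing conclusion formula and signature would still need trimming).
%% Cell type:code id: tags:
``` python
# Illustrative sketch (assumption): split a document's HTML body into per-article chunks
# by locating "Article N" headings; not the project's actual extract_body_from_html.
import re

def extract_body_from_html_sketch(doc_html):
    # Find the start of each article heading such as "<p>Article 1</p>" or "<h1 ...>Article 1</h1>"
    starts = [m.start() for m in re.finditer(r"<(?:p|h\d[^>]*)>(?:<em>)?Article \d+", doc_html)]
    if not starts:
        return None
    starts.append(len(doc_html))
    # Slice the HTML between consecutive article starts
    return [doc_html[starts[i]:starts[i + 1]].strip() for i in range(len(starts) - 1)]
```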
%% Cell type:code id: tags:
``` python
import json
import re
from tqdm import tqdm
body = {}
body_missing = {}
count_found = 0
count_missing = 0
for xml_file in tqdm(xml_files, desc="Processing documents"):
celex_id = os.path.splitext(os.path.basename(xml_file))[0]
doc_akn_xml_path = xml_file
if not os.path.exists(doc_akn_xml_path):
body_missing[celex_id] = {'reason': 'XML file not found'}
continue
body_akn_xml = extract_document_part(doc_akn_xml_path, 'body')
if body_akn_xml is None:
body_missing[celex_id] = {'reason': 'Body not found in XML'}
continue
doc_html_path = os.path.join(html_docs_dir, f"{celex_id}.html")
if not os.path.exists(doc_html_path):
body_missing[celex_id] = {'reason': 'HTML file not found'}
continue
with open(doc_html_path, 'r') as file:
doc_html = file.read()
doc_html = doc_html.replace('\xa0', ' ') # replace non-breaking space with a space
html_articles = extract_body_from_html(doc_html)
if html_articles is None:
body_missing[celex_id] = {'reason': 'HTML articles not found'}
count_missing += 1
continue
# Extract articles from XML and store in dictionary
xml_articles = {}
root = ET.fromstring(body_akn_xml)
# Find all article elements
for article in root.findall('.//article'):
# Get article number from num element
num_elem = article.find('.//num')
if num_elem is not None and 'Article' in num_elem.text:
article_num = num_elem.text.replace('Article ', '')
# Convert article element to string to preserve full XML content
article_content = ET.tostring(article, encoding='unicode')
xml_articles[article_num] = article_content
# Process each HTML article and match with XML
articles_matched = {}
# First verify that number of articles matches between HTML and XML
if html_articles and len(html_articles) != len(xml_articles):
body_missing[celex_id] = {
'reason': f'Mismatched article count - HTML: {len(html_articles)}, XML: {len(xml_articles)}'
}
count_missing += 1
continue
for html_article in html_articles:
# Extract article number from HTML
article_num_match = re.search(r'Article (\d+)', html_article)
if article_num_match:
article_num = article_num_match.group(1)
# Verify this article exists in XML
if article_num not in xml_articles:
body_missing[celex_id] = {
'reason': f'Article {article_num} found in HTML but missing from XML'
}
count_missing += 1
continue
# Remove tags from HTML content
tags_to_remove = ['mark']
cleaned_html = html_article
for tag in tags_to_remove:
cleaned_html = cleaned_html.replace(f'<{tag}>', '').replace(f'</{tag}>', '')
# Store both HTML and XML versions
articles_matched[article_num] = {
'html': cleaned_html,
'xml': xml_articles[article_num]
}
# Verify all articles were matched
if len(articles_matched) == len(xml_articles):
body[celex_id] = {
'articles': articles_matched,
'full_xml': body_akn_xml
}
count_found += 1
else:
body_missing[celex_id] = {
'reason': f'Not all articles matched - Matched: {len(articles_matched)}, Expected: {len(xml_articles)}',
'html_content_sample': doc_html[:100]
}
count_missing += 1
print(f"Found articles: {count_found}")
print(f"Not found articles: {count_missing}")
```
%% Output
Processing documents: 100%|██████████| 1244/1244 [00:00<00:00, 1534.07it/s]
Found articles: 1227
Not found articles: 15
%% Cell type:markdown id: tags:
## Preface dataset filtering and sorting based on content similarity between the XML and HTML files
%% Cell type:code id: tags:
``` python
!pip install -q lxml tqdm beautifulsoup4 scikit-learn
```
%% Output
[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: pip install --upgrade pip
%% Cell type:code id: tags:
``` python
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import re
```
%% Cell type:code id: tags:
``` python
def add_space_before_tags(xml_content):
"""Add a space before tags that are directly adjacent to text."""
# Use regex to find tags that are directly adjacent to text and add a space before them
return re.sub(r'(?<=\w)(<[^>]+>)', r' \1', xml_content)
def extract_text_from_soup(soup):
"""Extract clean text from a BeautifulSoup object."""
return soup.get_text()
def normalize_text(text):
"""Normalize text by removing extra whitespace and converting text to lowercase."""
text = re.sub(r'\s+', ' ', text).strip()
return text.lower()
def combined_similarity(text1, text2, weights=None):
"""Calculate a combined similarity score using multiple methods."""
if weights is None:
# Default equal weights for each similarity measure
weights = {
'levenshtein': 0.33,
'cosine': 0.33,
'jaccard': 0.34
}
# Calculate individual similarity scores
levenshtein_score = calculate_levenshtein_score(text1, text2)
cosine_score = calculate_cosine_similarity(text1, text2)
jaccard_score = jaccard_similarity(text1, text2)
# Combine scores using the specified weights
combined_score = (
weights['levenshtein'] * levenshtein_score +
weights['cosine'] * cosine_score +
weights['jaccard'] * jaccard_score
)
return combined_score
def process_document(celex_id, document):
processed_articles = []
# Get the articles dictionary from the document; each article already has 'html' and 'xml' keys.
articles = document.get('articles', {})
# Iterate over articles in sorted order (assuming keys are numeric strings)
for key in sorted(articles, key=lambda k: int(k)):
article = articles[key]
html_article = article.get('html', '')
xml_article_str = article.get('xml', '')
# Extract and normalize text from the HTML content
html_text = normalize_text(extract_text_from_soup(BeautifulSoup(html_article, 'html.parser')))
# Extract and normalize text from the XML content
xml_text = normalize_text(extract_text_from_soup(BeautifulSoup(xml_article_str, 'xml')))
# Compute similarity between the normalized texts
similarity = combined_similarity(html_text, xml_text)
# Append tuple with celex_id, HTML content, XML content, and similarity score
processed_articles.append((celex_id, html_article, xml_article_str, similarity))
return processed_articles
def create_articles_dataframe(documents_bodies):
"""Create a DataFrame from processed documents, where each row represents a single article.
Expects documents_bodies to be a dictionary mapping celex_id to a dictionary with keys:
- 'full_xml': complete XML content as a string.
- 'articles': list of HTML strings, each representing an article.
Returns a DataFrame with columns: celex_id, html, xml, similarity.
"""
data = []
for celex_id, document in tqdm(documents_bodies.items(), desc="Processing documents"):
articles_data = process_document(celex_id, document)
data.extend(articles_data)
df = pd.DataFrame(data, columns=['celex_id', 'html', 'xml', 'similarity'])
# Extract article IDs from XML
def extract_article_id(xml_str):
try:
soup = BeautifulSoup(xml_str, 'xml')
article = soup.find('article')
if article and 'eId' in article.attrs:
return article['eId']
return None
except Exception:
return None
# Create unique ID combining celex_id and article ID
df['article_id'] = df['xml'].apply(extract_article_id)
df['id'] = df['celex_id'] + '_' + df['article_id']
df.set_index('id', inplace=True)
return df
```
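%% Cell type:markdown id: tags:
`calculate_levenshtein_score`, `calculate_cosine_similarity` and `jaccard_similarity` are imported from the project's `functions` module and are not defined in this notebook. The sketch below is an assumption about what they might compute: a normalized edit-similarity ratio, TF-IDF cosine similarity, and token-set Jaccard similarity.
%% Cell type:code id: tags:
``` python
# Illustrative sketch (assumption): plausible stand-ins for the three similarity helpers
# used by combined_similarity above; not the project's actual implementations.
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_levenshtein_score(text1, text2):
    # Normalized similarity ratio in [0, 1] (difflib ratio as a stand-in for Levenshtein)
    return SequenceMatcher(None, text1, text2).ratio()

def calculate_cosine_similarity(text1, text2):
    # Cosine similarity between TF-IDF vectors of the two texts
    tfidf = TfidfVectorizer().fit_transform([text1, text2])
    return float(cosine_similarity(tfidf[0], tfidf[1])[0][0])

def jaccard_similarity(text1, text2):
    # Jaccard similarity of the two token sets
    tokens1, tokens2 = set(text1.split()), set(text2.split())
    if not tokens1 and not tokens2:
        return 1.0
    return len(tokens1 & tokens2) / len(tokens1 | tokens2)
```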
%% Cell type:code id: tags:
``` python
# Create DataFrame
df_articles = create_articles_dataframe(body)
```
%% Output
Processing documents: 100%|██████████| 1227/1227 [00:08<00:00, 152.38it/s]
%% Cell type:code id: tags:
``` python
df_articles
```
%% Output
celex_id \
id
32017R0185_art_1 32017R0185
32017R0185_art_2 32017R0185
32017R0185_art_3 32017R0185
32017R0185_art_4 32017R0185
32017R0185_art_5 32017R0185
... ...
32016R1334_art_2 32016R1334
32016R1687_art_1 32016R1687
32016R1687_art_2 32016R1687
32017R0490_art_1 32017R0490
32017R0490_art_2 32017R0490
html \
id
32017R0185_art_1 <p><em>Article 1</em></p>\n<p><em>Subject matt...
32017R0185_art_2 <p><em>Article 2</em></p>\n<p><em>Derogation c...
32017R0185_art_3 <p><em>Article 3</em></p>\n<p><em>Derogation c...
32017R0185_art_4 <p><em>Article 4</em></p>\n<p><em>Derogation c...
32017R0185_art_5 <p><em>Article 5</em></p>\n<p><em>Entry into f...
... ...
32016R1334_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
32016R1687_art_1 <p>Article 1</p>\n<p>Annex III to Regulation (...
32016R1687_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
32017R0490_art_1 <p>Article 1</p>\n<p>Annex I to Regulation (EU...
32017R0490_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
xml \
id
32017R0185_art_1 <article eId="art_1" GUID="001">\n <num...
32017R0185_art_2 <article eId="art_2" GUID="002">\n <num...
32017R0185_art_3 <article eId="art_3" GUID="003">\n <num...
32017R0185_art_4 <article eId="art_4" GUID="004">\n <num...
32017R0185_art_5 <article eId="art_5" GUID="005">\n <num...
... ...
32016R1334_art_2 <article eId="art_2" GUID="002">\n <num...
32016R1687_art_1 <article eId="art_1" GUID="001">\n <num...
32016R1687_art_2 <article eId="art_2" GUID="002">\n <num...
32017R0490_art_1 <article eId="art_1" GUID="001">\n <num...
32017R0490_art_2 <article eId="art_2" GUID="002">\n <num...
similarity article_id
id
32017R0185_art_1 0.895940 art_1
32017R0185_art_2 0.905612 art_2
32017R0185_art_3 1.000000 art_3
32017R0185_art_4 1.000000 art_4
32017R0185_art_5 0.865238 art_5
... ... ...
32016R1334_art_2 0.942471 art_2
32016R1687_art_1 1.000000 art_1
32016R1687_art_2 0.476332 art_2
32017R0490_art_1 1.000000 art_1
32017R0490_art_2 0.942471 art_2
[2938 rows x 5 columns]
%% Cell type:code id: tags:
``` python
print(f"Average similarity: {df_articles['similarity'].mean()}")
```
%% Output
Average similarity: 0.9239921332656981
%% Cell type:markdown id: tags:
### Sort prefaces by similarity in descending order
%% Cell type:code id: tags:
``` python
# Sort DataFrame by similarity in descending order
df_articles_sorted = df_articles.sort_values(by='similarity', ascending=False)
# Print sorted results
print("Top 2 documents with highest similarity:")
print(df_articles_sorted.head(2)[['similarity']])
print("\nBottom 2 documents with lowest similarity:")
print(df_articles_sorted.tail(4)[['similarity']])
print(f"\nAverage similarity: {df_articles['similarity'].mean():.4f}")
```
%% Output
Top 2 documents with highest similarity:
similarity
id
32020R1641_art_3 1.0
32017R1110_art_5 1.0
Bottom 2 documents with lowest similarity:
similarity
id
32018R1935_art_2 0.426355
32019R0985_art_2 0.419034
32020R0182_art_6 0.350657
32016R1737_art_2 0.167723
Average similarity: 0.9240
%% Cell type:markdown id: tags:
## Saving data
%% Cell type:markdown id: tags:
### Save sorted dataset
%% Cell type:code id: tags:
``` python
df_articles_sorted
```
%% Output
celex_id \
id
32020R1641_art_3 32020R1641
32017R1110_art_5 32017R1110
32016R2075_art_1 32016R2075
32016R2148_art_3 32016R2148
32020R0501_art_4 32020R0501
... ...
32016R0983_art_1 32016R0983
32018R1935_art_2 32018R1935
32019R0985_art_2 32019R0985
32020R0182_art_6 32020R0182
32016R1737_art_2 32016R1737
html \
id
32020R1641_art_3 <p>Article 3</p>\n<p><strong>Equivalence</stro...
32017R1110_art_5 <p>Article 5</p>\n<p>Notification of changes t...
32016R2075_art_1 <p>Article 1</p>\n<p>The maximum number of day...
32016R2148_art_3 <p>Article 3</p>\n<p>Importers who have alread...
32020R0501_art_4 <p>Article 4</p>\n<p>By way of derogation from...
... ...
32016R0983_art_1 <p>Article 1</p>\n<p>Council Regulation (EC) N...
32018R1935_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
32019R0985_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
32020R0182_art_6 <p>Article 6</p>\n<p>This Regulation shall ent...
32016R1737_art_2 <p>Article 2</p>\n<p>This Regulation shall ent...
xml \
id
32020R1641_art_3 <article eId="art_3" GUID="003">\n <num...
32017R1110_art_5 <article eId="art_5" GUID="005">\n <num...
32016R2075_art_1 <article eId="art_1" GUID="001">\n <num...
32016R2148_art_3 <article eId="art_3" GUID="003">\n <num...
32020R0501_art_4 <article eId="art_4" GUID="004">\n <num...
... ...
32016R0983_art_1 <article eId="art_1" GUID="001">\n <num...
32018R1935_art_2 <article eId="art_2" GUID="002">\n <num...
32019R0985_art_2 <article eId="art_2" GUID="002">\n <num...
32020R0182_art_6 <article eId="art_6" GUID="006">\n <num...
32016R1737_art_2 <article eId="art_2" GUID="002">\n <num...
similarity article_id
id
32020R1641_art_3 1.000000 art_3
32017R1110_art_5 1.000000 art_5
32016R2075_art_1 1.000000 art_1
32016R2148_art_3 1.000000 art_3
32020R0501_art_4 1.000000 art_4
... ... ...
32016R0983_art_1 0.439238 art_1
32018R1935_art_2 0.426355 art_2
32019R0985_art_2 0.419034 art_2
32020R0182_art_6 0.350657 art_6
32016R1737_art_2 0.167723 art_2
[2938 rows x 5 columns]
%% Cell type:code id: tags:
``` python
import os
import json
# Create directory if it does not exist
output_dir = f'{DATA_DIR}/articles/datasets/html'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Keep only the relevant columns and reset the index
df_articles_sorted = df_articles_sorted.reset_index()[['celex_id', 'xml', 'html', 'similarity']]
# Save dataset
dataset_file = os.path.join(output_dir, 'dataset.json')
df_articles_sorted.to_json(dataset_file, orient='records', indent=2)
# Create metadata file
metadata = {
'date': datetime.now().strftime('%Y-%m-%d'),
'dataset_type': 'articles',
'dataset_name': 'articles_html',
'num_articles': len(df_articles_sorted),
'columns': list(df_articles_sorted.columns),
'avg_similarity': float(df_articles_sorted['similarity'].mean()),
'min_similarity': float(df_articles_sorted['similarity'].min()),
'max_similarity': float(df_articles_sorted['similarity'].max())
}
metadata_file = os.path.join(output_dir, 'metadata.json')
with open(metadata_file, 'w') as f:
json.dump(metadata, f, indent=2)
```
%% Cell type:markdown id: tags:
### Save sorted dataset to S3
%% Cell type:code id: tags:
``` python
from dotenv import load_dotenv
import os
import json
import re
import sys
import dspy
current_dir = os.getcwd()
parent_dir = f"{os.path.dirname(current_dir)}"
sys.path.append(f"{parent_dir}")
from functions import *
from xml_util import *
load_dotenv()
```
%% Output
True
%% Cell type:code id: tags:
``` python
from dotenv import load_dotenv
load_dotenv()
```
%% Cell type:code id: tags:
``` python
# S3 bucket and subfolder details
bucket_name = "ai4xml-data"
subfolder = "training/" # Specify your subfolder path with trailing slash
dataset_path = os.path.join(output_dir, 'dataset.json')
# Upload dataset to S3 subfolder by prepending the subfolder path
s3_path = subfolder + "prefaces_dataset.json"
upload_success = upload_to_s3(
dataset_path,
bucket_name,
s3_path,
aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY')
)
if upload_success:
print(f"Dataset successfully uploaded to s3://{bucket_name}/{s3_path}")
else:
print("Failed to upload dataset to S3")
```
%% Output
Upload Successful: training/prefaces_dataset.json
Dataset successfully uploaded to s3://ai4xml-data/training/prefaces_dataset.json
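%% Cell type:markdown id: tags:
`upload_to_s3` is imported from the project's `functions` module. A minimal sketch of what such a helper typically looks like with boto3 is shown below; this is an assumption about the implementation, not the project's actual code.
%% Cell type:code id: tags:
``` python
# Illustrative sketch (assumption): a boto3-based upload helper similar in spirit to upload_to_s3.
import boto3

def upload_to_s3_sketch(local_path, bucket_name, s3_key,
                        aws_access_key_id=None, aws_secret_access_key=None):
    s3 = boto3.client(
        "s3",
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    try:
        s3.upload_file(local_path, bucket_name, s3_key)
        print(f"Upload Successful: {s3_key}")
        return True
    except Exception as e:
        print(f"Upload failed: {e}")
        return False
```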
%% Cell type:markdown id: tags:
# Dataset analysis
%% Cell type:code id: tags:
``` python
import pandas as pd
import os
output_dir = f'{DATA_DIR}/articles/datasets/html'
# Read the dataset that was saved earlier
dataset_file = os.path.join(output_dir, 'dataset.json')
df_articles_sorted = pd.read_json(dataset_file)
print(f"Total records: {len(df_articles_sorted)}")
```
%% Output
Total records: 2938
%% Cell type:code id: tags:
``` python
# Analyze tag usage across all documents
doc_tags, all_unique_tags = analyze_tags_in_k_rows(df_articles_sorted, k=len(df_articles_sorted))
```
%% Output
Unique XML tags and their document counts from the first 2938 documents:
- alinea: 1227 document(s)
- article: 1227 document(s)
- authorialNote: 191 document(s)
- content: 1227 document(s)
- date: 428 document(s)
- def: 10 document(s)
- defBody: 7 document(s)
- heading: 181 document(s)
- intro: 69 document(s)
- list: 69 document(s)
- mod: 35 document(s)
- num: 1227 document(s)
- p: 1227 document(s)
- paragraph: 151 document(s)
- point: 69 document(s)
- quantity: 20 document(s)
- quotedStructure: 2 document(s)
- ref: 189 document(s)
- span: 1216 document(s)
- table: 8 document(s)
- td: 8 document(s)
- term: 3 document(s)
- tr: 8 document(s)
Total number of unique tags in first 2938 documents: 23
%% Cell type:code id: tags:
``` python
# Find documents containing the given tag
search_tag = 'table'
docs_with_search_tag = [id for id, tags in doc_tags.items() if search_tag in tags]
docs_with_search_tag
```
%% Output
['32019R0952', '32019R1926', '32018R0076', '32019R2219']
%% Cell type:markdown id: tags:
## Extract tags from each article
%% Cell type:code id: tags:
``` python
import xml.etree.ElementTree as ET
def extract_tags(xml_string):
try:
root = ET.fromstring(xml_string)
return [elem.tag for elem in root.iter()]
except ET.ParseError:
return []
df_articles_sorted['tags'] = df_articles_sorted['xml'].apply(extract_tags)
```
%% Cell type:code id: tags:
``` python
df_articles_sorted.head()
```
%% Output
celex_id xml \
0 32020R1641 <article eId="art_3" GUID="003">\n <num...
1 32017R1110 <article eId="art_5" GUID="005">\n <num...
2 32016R2075 <article eId="art_1" GUID="001">\n <num...
3 32016R2148 <article eId="art_3" GUID="003">\n <num...
4 32020R0501 <article eId="art_4" GUID="004">\n <num...
html similarity \
0 <p>Article 3</p>\n<p><strong>Equivalence</stro... 1.0
1 <p>Article 5</p>\n<p>Notification of changes t... 1.0
2 <p>Article 1</p>\n<p>The maximum number of day... 1.0
3 <p>Article 3</p>\n<p>Importers who have alread... 1.0
4 <p>Article 4</p>\n<p>By way of derogation from... 1.0
tags
0 [article, num, heading, alinea, content, p]
1 [article, num, heading, paragraph, num, alinea...
2 [article, num, alinea, content, p]
3 [article, num, alinea, content, p]
4 [article, num, alinea, content, p, mod]
%% Cell type:code id: tags:
``` python
ignored_tags = ['mod']
# remove ignored tags from each article's tag list, then drop articles left with no tags
df_articles_sorted['tags'] = df_articles_sorted['tags'].apply(lambda tags: [tag for tag in tags if tag not in ignored_tags])
df_articles_sorted = df_articles_sorted[df_articles_sorted['tags'].apply(lambda tags: len(tags) > 0)]
```
%% Cell type:markdown id: tags:
# Creating a balanced dataset
%% Cell type:code id: tags:
``` python
df_articles_sorted['tag_label'] = df_articles_sorted['tags'].apply(lambda tags: '-'.join(sorted(set(tags))))
```
%% Cell type:code id: tags:
``` python
df_articles_sorted
```
%% Output
celex_id xml \
0 32020R1641 <article eId="art_3" GUID="003">\n <num...
1 32017R1110 <article eId="art_5" GUID="005">\n <num...
2 32016R2075 <article eId="art_1" GUID="001">\n <num...
3 32016R2148 <article eId="art_3" GUID="003">\n <num...
4 32020R0501 <article eId="art_4" GUID="004">\n <num...
... ... ...
2933 32016R0983 <article eId="art_1" GUID="001">\n <num...
2934 32018R1935 <article eId="art_2" GUID="002">\n <num...
2935 32019R0985 <article eId="art_2" GUID="002">\n <num...
2936 32020R0182 <article eId="art_6" GUID="006">\n <num...
2937 32016R1737 <article eId="art_2" GUID="002">\n <num...
html similarity \
0 <p>Article 3</p>\n<p><strong>Equivalence</stro... 1.000000
1 <p>Article 5</p>\n<p>Notification of changes t... 1.000000
2 <p>Article 1</p>\n<p>The maximum number of day... 1.000000
3 <p>Article 3</p>\n<p>Importers who have alread... 1.000000
4 <p>Article 4</p>\n<p>By way of derogation from... 1.000000
... ... ...
2933 <p>Article 1</p>\n<p>Council Regulation (EC) N... 0.439238
2934 <p>Article 2</p>\n<p>This Regulation shall ent... 0.426355
2935 <p>Article 2</p>\n<p>This Regulation shall ent... 0.419034
2936 <p>Article 6</p>\n<p>This Regulation shall ent... 0.350657
2937 <p>Article 2</p>\n<p>This Regulation shall ent... 0.167723
tags \
0 [article, num, heading, alinea, content, p]
1 [article, num, heading, paragraph, num, alinea...
2 [article, num, alinea, content, p]
3 [article, num, alinea, content, p]
4 [article, num, alinea, content, p]
... ...
2933 [article, num, alinea, content, p, authorialNo...
2934 [article, num, alinea, content, p, date, aline...
2935 [article, num, alinea, content, p, span]
2936 [article, num, alinea, content, p, span]
2937 [article, num, alinea, content, p, span]
single_tag \
0 ['article', 'num', 'heading', 'alinea', 'conte...
1 ['article', 'num', 'heading', 'paragraph', 'nu...
2 ['article', 'num', 'alinea', 'content', 'p']
3 ['article', 'num', 'alinea', 'content', 'p']
4 ['article', 'num', 'alinea', 'content', 'p']
... ...
2933 ['article', 'num', 'alinea', 'content', 'p', '...
2934 ['article', 'num', 'alinea', 'content', 'p', '...
2935 ['article', 'num', 'alinea', 'content', 'p', '...
2936 ['article', 'num', 'alinea', 'content', 'p', '...
2937 ['article', 'num', 'alinea', 'content', 'p', '...
tag_label
0 alinea-article-content-heading-num-p
1 alinea-article-content-heading-num-p-paragraph
2 alinea-article-content-num-p
3 alinea-article-content-num-p
4 alinea-article-content-num-p
... ...
2933 alinea-article-authorialNote-content-date-num-...
2934 alinea-article-content-date-num-p
2935 alinea-article-content-num-p-span
2936 alinea-article-content-num-p-span
2937 alinea-article-content-num-p-span
[2938 rows x 7 columns]
%% Cell type:code id: tags:
``` python
# Find tag labels that appear only once (required for stratified split)
tag_label_counts = df_articles_sorted['tag_label'].value_counts()
single_occurrence_labels = tag_label_counts[tag_label_counts == 1].index
# Remove rows with tags that only appear once
df_articles_sorted = df_articles_sorted[~df_articles_sorted['tag_label'].isin(single_occurrence_labels)]
print(f"Removed {len(single_occurrence_labels)} documents with unique tag combinations")
```
%% Output
Removed 35 documents with unique tag combinations
%% Cell type:code id: tags:
``` python
plot_tag_distribution(df_articles_sorted)
```
%% Output
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import train_test_split
# Split into train and temp (80%) and test (20%)
train_val_df, test_df = train_test_split(df_articles_sorted, test_size=0.2, random_state=42, stratify=df_articles_sorted['tag_label'])
# Split train_val into train (20%) and validation (80%)
# DSPy suggests starting with this configuration: https://dspy.ai/learn/optimization/overview/?h=20%25
train_df, val_df = train_test_split(train_val_df, test_size=0.8, random_state=42, stratify=train_val_df['tag_label'])
# Output results
print(f"Train Set Size: {len(train_df)}")
print(f"Validation Set Size: {len(val_df)}")
print(f"Test Set Size: {len(test_df)}")
```
%% Output
Train Set Size: 464
Validation Set Size: 1858
Test Set Size: 581
%% Cell type:code id: tags:
``` python
# Inspect tag distribution across splits
from collections import Counter
def check_tag_distribution(split_name, split_df):
tags_in_split = [tag for tags in split_df['tags'] for tag in tags]
print(f"Tag Distribution in {split_name}: {Counter(tags_in_split)}")
check_tag_distribution("Train", train_df)
check_tag_distribution("Validation", val_df)
check_tag_distribution("Test", test_df)
```
%% Output
Tag Distribution in Train: Counter({'p': 632, 'content': 588, 'num': 570, 'alinea': 552, 'article': 464, 'span': 226, 'date': 121, 'heading': 89, 'paragraph': 69, 'point': 39, 'authorialNote': 31, 'ref': 31, 'intro': 13, 'list': 12, 'mod': 8, 'defBody': 8, 'def': 6, 'quantity': 2})
Tag Distribution in Validation: Counter({'p': 2598, 'content': 2392, 'num': 2273, 'alinea': 2270, 'article': 1858, 'span': 924, 'date': 482, 'heading': 356, 'paragraph': 298, 'point': 131, 'ref': 121, 'authorialNote': 116, 'intro': 47, 'list': 40, 'td': 35, 'mod': 26, 'quantity': 14, 'tr': 14, 'table': 7, 'def': 6, 'defBody': 6})
Tag Distribution in Test: Counter({'p': 829, 'content': 754, 'num': 714, 'alinea': 710, 'article': 581, 'span': 283, 'date': 143, 'heading': 109, 'paragraph': 99, 'point': 47, 'authorialNote': 38, 'ref': 38, 'td': 18, 'intro': 17, 'list': 14, 'mod': 7, 'tr': 6, 'quantity': 4, 'def': 3, 'defBody': 3, 'table': 2})
%% Cell type:code id: tags:
``` python
doc_tags, all_unique_tags = analyze_tags_in_k_rows(test_df, k=100)
```
%% Output
All unique XML tags found in first 100 documents:
- alinea
- article
- authorialNote
- content
- date
- heading
- intro
- list
- num
- p
- paragraph
- point
- quantity
- ref
- span
- table
- td
- tr
Total number of unique tags in first 100 documents: 18
%% Cell type:code id: tags:
``` python
# Get the first 100 documents
docs_subset = list(doc_tags.items())[:100]
# Count, for each tag, the number of documents that contain it
from collections import defaultdict
tag_frequency = defaultdict(int)
for _, tags in docs_subset:
for tag in tags:
tag_frequency[tag] += 1
# Sort tags by frequency in descending order
sorted_tags = sorted(tag_frequency.items(), key=lambda x: x[1], reverse=True)
print(f"Tag distribution in first {len(docs_subset)} documents:")
print("\nTag | Frequency | % of Documents")
print("-" * 40)
for tag, freq in sorted_tags:
percentage = (freq / len(docs_subset)) * 100
print(f"{tag:<20} {freq:>5} {percentage:>10.1f}%")
print(f"\nTotal unique tags in first {len(docs_subset)} documents: {len(tag_frequency)}")
```
%% Output
Tag distribution in first 500 documents:
Tag | Frequency | % of Documents
----------------------------------------
content 98 19.6%
article 98 19.6%
p 98 19.6%
num 98 19.6%
alinea 97 19.4%
span 47 9.4%
heading 20 4.0%
date 16 3.2%
paragraph 8 1.6%
authorialNote 7 1.4%
ref 7 1.4%
list 4 0.8%
point 4 0.8%
intro 4 0.8%
table 1 0.2%
tr 1 0.2%
td 1 0.2%
quantity 1 0.2%
Total unique tags in first 500 documents: 18
%% Cell type:code id: tags:
``` python
# Get tag statistics for train_df
tag_counts = Counter([tag for tags in train_df['tags'] for tag in tags])
total_tags = sum(tag_counts.values())
print("\nDetailed Tag Statistics in Train Set:")
print("-" * 50)
print(f"Total number of tags: {total_tags}")
print("\nTag frequencies:")
for tag, count in tag_counts.most_common():
percentage = (count / total_tags) * 100
print(f"{tag}: {count} ({percentage:.2f}%)")
```
%% Output
Detailed Tag Statistics in Train Set:
--------------------------------------------------
Total number of tags: 3461
Tag frequencies:
p: 632 (18.26%)
content: 588 (16.99%)
num: 570 (16.47%)
alinea: 552 (15.95%)
article: 464 (13.41%)
span: 226 (6.53%)
date: 121 (3.50%)
heading: 89 (2.57%)
paragraph: 69 (1.99%)
point: 39 (1.13%)
authorialNote: 31 (0.90%)
ref: 31 (0.90%)
intro: 13 (0.38%)
list: 12 (0.35%)
mod: 8 (0.23%)
defBody: 8 (0.23%)
def: 6 (0.17%)
quantity: 2 (0.06%)
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
# Create stratified splits
%% Cell type:markdown id: tags:
---------*******-------
%% Cell type:code id: tags:
``` python
# Add a column with the length of the XML content
df_articles_sorted['xml_length'] = df_articles_sorted['xml'].str.len()
# Display the 5 rows with the longest XML among the first 1000 rows
longest_xml = df_articles_sorted[:1000].nlargest(5, 'xml_length')
longest_xml
```
%% Output
celex_id xml \
938 32020R0533 <article eId="art_1" GUID="001">\n <num...
914 32016R0911 <article eId="art_2" GUID="002">\n <num...
912 32020R1546 <article eId="art_1" GUID="001">\n <num...
867 32018R0076 <article eId="art_1" GUID="001">\n <num...
653 32017R1272 <article eId="art_1" GUID="001">\n <num...
html similarity \
938 <h1 id="article-1"><em>Article 1</em></h1>\n<p... 0.980638
914 <p>Article 2<br />\nTerms to be disclosed</p>\... 0.992917
912 <p>Article 1</p>\n<p><strong>Inventory structu... 0.993462
867 <p>Article 1</p>\n<p>(1) The fishing opportuni... 1.000000
653 <p>Article 1</p>\n<p>1. The annual national ce... 1.000000
tags xml_length
938 [article, num, heading, paragraph, num, alinea... 5511
914 [article, num, heading, paragraph, num, list, ... 5457
912 [article, num, heading, paragraph, num, alinea... 4774
867 [article, num, paragraph, num, list, intro, p,... 3813
653 [article, num, paragraph, num, alinea, content... 3788
%% Cell type:code id: tags:
``` python
train_df, val_df, test_df = create_stratified_splits(df_articles_sorted[:1000], test_size=0.33, val_size=0.33)
```
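%% Cell type:markdown id: tags:
`create_stratified_splits` is imported from the project's `functions` module. A plausible sketch of what it might do (an assumption, not the actual implementation) is two stratified `train_test_split` calls on the `tag_label` column, with the validation fraction rescaled relative to the remaining rows.
%% Cell type:code id: tags:
``` python
# Illustrative sketch (assumption): a stratified three-way split on tag_label,
# roughly matching how the imported create_stratified_splits is called above.
from sklearn.model_selection import train_test_split

def create_stratified_splits_sketch(df, test_size=0.33, val_size=0.33, random_state=42):
    # Carve out the test set first, stratified by tag_label
    train_val, test = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=df["tag_label"]
    )
    # Split the remainder into train and validation; rescale val_size so it is
    # a fraction of the remaining rows
    rel_val_size = val_size / (1.0 - test_size)
    train, val = train_test_split(
        train_val, test_size=rel_val_size, random_state=random_state,
        stratify=train_val["tag_label"]
    )
    return train, val, test
```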
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import numpy as np
# Create figure
plt.figure(figsize=(15,6))
# Get tag distributions for each dataset
train_tags, _ = analyze_tags_in_k_rows(train_df, k=len(train_df))
val_tags, _ = analyze_tags_in_k_rows(val_df, k=len(val_df))
test_tags, _ = analyze_tags_in_k_rows(test_df, k=len(test_df))
# Get data for each set
train_tags, train_counts = plot_tag_distribution(train_tags, 'Training Set')
val_tags, val_counts = plot_tag_distribution(val_tags, 'Validation Set')
test_tags, test_counts = plot_tag_distribution(test_tags, 'Test Set')
# Get union of all tags to ensure consistent x-axis
all_tags = list(set(train_tags + val_tags + test_tags))
all_tags.sort()
# Create dictionaries mapping tags to counts, defaulting to 0 for missing tags
train_dict = dict(zip(train_tags, train_counts))
val_dict = dict(zip(val_tags, val_counts))
test_dict = dict(zip(test_tags, test_counts))
# Get counts in consistent order
train_counts_aligned = [train_dict.get(tag, 0) for tag in all_tags]
val_counts_aligned = [val_dict.get(tag, 0) for tag in all_tags]
test_counts_aligned = [test_dict.get(tag, 0) for tag in all_tags]
# Plot aligned distributions
width = 0.25
x = np.arange(len(all_tags))
train_bars = plt.bar(x - width, train_counts_aligned, width, label='Training Set')
val_bars = plt.bar(x, val_counts_aligned, width, label='Validation Set')
test_bars = plt.bar(x + width, test_counts_aligned, width, label='Test Set')
# Add value labels on the bars
for bars in [train_bars, val_bars, test_bars]:
for bar in bars:
height = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2., height,
f'{int(height)}',
ha='center', va='bottom')
plt.xticks(x, all_tags, rotation=45, ha='right')
plt.title('Distribution of XML Tags Across Datasets')
plt.xlabel('Tag Name')
plt.ylabel('Number of Documents')
plt.legend()
plt.tight_layout()
plt.show()
```
%% Output
Unique XML tags and their document counts from the first 342 documents:
- alinea: 295 document(s)
- article: 302 document(s)
- content: 302 document(s)
- date: 18 document(s)
- def: 1 document(s)
- heading: 76 document(s)
- intro: 13 document(s)
- list: 13 document(s)
- mod: 13 document(s)
- num: 302 document(s)
- p: 302 document(s)
- paragraph: 31 document(s)
- point: 13 document(s)
- quantity: 4 document(s)
- ref: 1 document(s)
- span: 3 document(s)
- table: 3 document(s)
- td: 3 document(s)
- term: 1 document(s)
- tr: 3 document(s)
Total number of unique tags in first 342 documents: 20
Unique XML tags and their document counts from the first 328 documents:
- alinea: 280 document(s)
- article: 282 document(s)
- content: 282 document(s)
- date: 18 document(s)
- def: 1 document(s)
- heading: 69 document(s)
- intro: 12 document(s)
- list: 12 document(s)
- mod: 11 document(s)
- num: 282 document(s)
- p: 282 document(s)
- paragraph: 27 document(s)
- point: 12 document(s)
- quantity: 2 document(s)
- ref: 2 document(s)
- span: 2 document(s)
- term: 1 document(s)
Total number of unique tags in first 328 documents: 17
Unique XML tags and their document counts from the first 330 documents:
- alinea: 290 document(s)
- article: 295 document(s)
- content: 294 document(s)
- date: 17 document(s)
- def: 2 document(s)
- defBody: 1 document(s)
- heading: 77 document(s)
- intro: 14 document(s)
- list: 14 document(s)
- mod: 12 document(s)
- num: 295 document(s)
- p: 295 document(s)
- paragraph: 27 document(s)
- point: 14 document(s)
- quantity: 2 document(s)
- ref: 1 document(s)
- span: 2 document(s)
- table: 1 document(s)
- td: 1 document(s)
- term: 1 document(s)
- tr: 1 document(s)
Total number of unique tags in first 330 documents: 21
%% Cell type:code id: tags:
``` python
longest_xml = test_df[:1000].nlargest(5, 'xml_length')
longest_xml
```
%% Output
celex_id xml \
6 32020R0533 <article eId="art_1" GUID="001">\n <num...
307 32020R0750 <article eId="art_1" GUID="001">\n <num...
108 32018R0891 <article eId="art_1" GUID="001">\n <num...
191 32018R2018 <article eId="art_3" GUID="003">\n <num...
118 32018R1095 <article eId="art_1" GUID="001">\n <num...
html similarity \
6 <h1 id="article-1"><em>Article 1</em></h1>\n<p... 0.980638
307 <p>Article 1</p>\n<p>1. By way of derogation f... 0.968563
108 <p>Article 1</p>\n<p>The annual national ceili... 1.000000
191 <p>Article 3</p>\n<p><strong>Content of the te... 0.985714
118 <p>Article 1</p>\n<p>1. The fishing opportunit... 1.000000
tags xml_length
6 [article, num, heading, paragraph, num, alinea... 5511
307 [article, num, paragraph, num, alinea, content... 3147
108 [article, num, alinea, content, p, alinea, con... 2780
191 [article, num, heading, list, intro, p, point,... 2697
118 [article, num, paragraph, num, list, intro, p,... 2529
%% Cell type:code id: tags:
``` python
print(f"length of train_df: {len(train_df)}")
print(f"length of val_df: {len(val_df)}")
print(f"length of test_df: {len(test_df)}")
```
%% Output
length of train_df: 342
length of val_df: 328
length of test_df: 330
%% Cell type:code id: tags:
``` python
# Create directories if they don't exist
import os
os.makedirs(f'{DATA_DIR}/articles/datasets/html/stratified', exist_ok=True)
# Save each dataset to a JSON file in the html subdirectory
train_df.to_json(f'{DATA_DIR}/articles/datasets/html/stratified/train.json', orient='records', indent=2)
val_df.to_json(f'{DATA_DIR}/articles/datasets/html/stratified/val.json', orient='records', indent=2)
test_df.to_json(f'{DATA_DIR}/articles/datasets/html/stratified/test.json', orient='records', indent=2)
```
@@ -275,6 +275,82 @@ def extract_preamble_from_text(content, file_path):
}
}
def extract_preamble_from_html(content, file_path):
celex_id = os.path.basename(file_path).split('.')[0]
# Remove <mark> tags and their content
content = re.sub(r'<mark>([^<]*(?:<(?!/mark>)[^<]*)*)</mark>', r'\1', content)
# Pattern to capture the preamble components in HTML
pattern = r"((?:<p>THE EUROPEAN COMMISSION,|<p>THE COUNCIL OF THE EUROPEAN UNION,|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION|<p>THE EUROPEAN COMMISSION|<p>THE GOVERNING COUNCIL OF THE EUROPEAN CENTRAL BANK,|<p>THE COUNCIL OF THE EUROPEAN UNION|<p>THE COUNCIL OF THE OPEAN UNION,|<p>THE EUROPEAN PARLIAMENT,|<p>THE EUROPEAN COMMUNITIES,|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL,|<p>The GOVERNING COUNCIL OF THE EUROPEAN CENTRAL BANK,|<p>THE MANAGEMENT BOARD,).*?)(?=<p>HAS ADOPTED THIS REGULATION:|<p>HAVE ADOPTED THIS REGULATION:|<p>HAS ADOPTED THIS DIRECTIVE:|<p>HAVE ADOPTED THIS DIRECTIVE:|<p>HAS ADOPTED THIS DECISION:|<p>HAVE ADOPTED THIS DECISION:|<p>HAS ADOPTED THE FOLLOWING REGULATION:)"
match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
if not match:
return {
"success": False,
"error": "No preamble found",
"file_path": file_path
}
preamble = match.group(1).strip()
# Extract formula (commission declaration) using a separate function
formula = extract_formula_from_html(preamble)
if not formula:
return {
"success": False,
"error": "No formula found",
"file_path": file_path
}
# Extract citations and recitals using a separate function
citations_part, recitals_part = extract_citations_and_recitals_from_html(preamble)
citations = extract_citations_from_html(citations_part)
if citations and citations[0] == formula:
citations = citations[1:] # Remove the formula from citations
else:
return {
"success": False,
"error": "Formula not found as first citation",
"file_path": file_path,
"formula": formula,
"citations": citations
}
recitals_result = extract_recitals_from_html(recitals_part, celex_id)
# Check if there are errors in recitals extraction
if recitals_result["errors"]:
return {
"success": False,
"error": "Errors in recitals extraction",
"file_path": file_path,
"recitals_errors": recitals_result["errors"]
}
# Extract preamble final using a separate function
preamble_final = extract_preamble_final_from_html(content)
# Extract footnotes
footnotes = extract_footnotes_from_html(content)
return {
"success": True,
"data": {
"formula": formula.strip(),
"citations": citations,
"recitals": recitals_result["recitals"],
"recitals_duplicates": recitals_result["duplicates"],
"preamble_final": preamble_final.strip(),
"footnotes": footnotes
}
}
def replace_footnotes_in_text(text, footnotes_dict):
"""Replace footnote references with their content in a list of text strings."""
footnote_numbers = re.findall(r'\[\^(\d+)\]', text)
@@ -308,6 +384,26 @@ def extract_preamble_final(content):
return final_phrase_match.group(1) if final_phrase_match else ""
def extract_preamble_final_from_html(content):
import re
final_phrase_patterns = [
r"(<p>HAS ADOPTED THIS REGULATION:</p>)",
r"(<p>HAVE ADOPTED THIS REGULATION:</p>)",
r"(<p>HAS ADOPTED THIS DIRECTIVE:</p>)",
r"(<p>HAVE ADOPTED THIS DIRECTIVE:</p>)",
r"(<p>HAS ADOPTED THIS DECISION:</p>)",
r"(<p>HAVE ADOPTED THIS DECISION:</p>)",
r"(<p>HAS ADOPTED THE FOLLOWING REGULATION:</p>)",
]
for pattern in final_phrase_patterns:
final_phrase_match = re.search(pattern, content, re.IGNORECASE)
if final_phrase_match:
break
return final_phrase_match.group(1) if final_phrase_match else ""
def extract_formula(preamble):
import re
@@ -317,6 +413,23 @@ def extract_formula(preamble):
return formula_match.group(1) if formula_match else None
def extract_formula_from_html(html_content):
import re
# Define the pattern for extracting the formula with p tags
formula_pattern = r"(<p>THE EUROPEAN COMMISSION,</p>|<p>THE COUNCIL OF THE EUROPEAN UNION,</p>|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,</p>|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,</p>|<p>THE EUROPEAN COMMISSION,</p>|<p>THE GOVERNING COUNCIL OF THE EUROPEAN CENTRAL BANK,</p>|<p>THE COUNCIL OF THE EUROPEAN UNION,</p>|<p>THE COUNCIL OF THE OPEAN UNION,</p>|<p>THE EUROPEAN PARLIAMENT,</p>|<p>THE EUROPEAN COMMUNITIES,</p>|<p>THE EUROPEAN PARLIAMENT AND THE COUNCIL,</p>|<p>The GOVERNING COUNCIL OF THE EUROPEAN CENTRAL BANK,</p>|<p>THE MANAGEMENT BOARD,</p>)"
formula_match = re.search(formula_pattern, html_content)
if formula_match:
formula_html = formula_match.group(1)
return formula_html
return None
def extract_citations_and_recitals(preamble):
# Split the preamble into citations and recitals using 'Whereas' as the separator
@@ -331,6 +444,20 @@ def extract_citations_and_recitals(preamble):
return citations_part, recitals_part
def extract_citations_and_recitals_from_html(html_content):
# Split the preamble into citations and recitals using 'Whereas' as the separator
splitters = ["<p>Whereas:</p>", "<p>WHEREAS:</p>", "<p>Whereas,</p>", "<p>Whereas</p>", "<p>whereas:</p>"]
for splitter in splitters:
if splitter in html_content:
citations_part, recitals_part = html_content.split(splitter, 1)
break
else:
citations_part = None
recitals_part = None
return citations_part, recitals_part
def extract_citations(citations_part):
if citations_part:
@@ -344,6 +471,18 @@ def extract_citations(citations_part):
return []
def extract_citations_from_html(citations_part):
if citations_part:
return [
citation.strip() + "</p>"
for citation in citations_part.split("</p>\n")
if citation.strip()
]
else:
print("citation part is empty!")
return []
def extract_recitals(recitals_text, celex_id):
recitals = {}
duplicates = []
@@ -389,6 +528,51 @@ def extract_recitals(recitals_text, celex_id):
return result
def extract_recitals_from_html(recitals_text, celex_id):
recitals = {}
duplicates = []
result = {
"recitals": {},
"duplicates": [],
"errors": []
}
# Corrected pattern to match actual new lines and the digit in parentheses
pattern = r"\n(?:<p>\s*(?:\((\d+)\)|<p>\((\d+)\)))"
# Find all matches of the pattern
matches = list(re.finditer(pattern, recitals_text))
for i in range(len(matches)):
start = matches[i].start()
end = matches[i + 1].start() if i + 1 < len(matches) else len(recitals_text)
recital_number = int(matches[i].group(1) or matches[i].group(2))
recital_text = recitals_text[start:end].strip()
if recital_number in recitals:
duplicates.append(recital_number)
# Extract the number value from the beginning of the recital text
number_match = re.match(r"^\s*(?:\\\((\d+)\\\)|\((\d+)\))", recital_text)
if number_match:
corrected_number = max(recitals.keys()) + 1
recital_text = re.sub(
r"^\s*(?:\\\(\d+\\\)|\(\d+\))", f"({corrected_number})", recital_text
)
recitals[corrected_number] = recital_text
else:
result["errors"].append(f"Recital with no number found: {recital_text}")
else:
recitals[recital_number] = recital_text
result["recitals"] = recitals
result["duplicates"] = duplicates
return result
def extract_preamble_from_xml(xml_content):
root = ET.fromstring(xml_content)
@@ -597,6 +781,49 @@ def create_preamble_dataset_from_md(content_files):
return preamble_dataset, failed_analysis, recitals_duplicates
def create_preamble_dataset_from_html(content_files):
preamble_dataset = []
failed_analysis = []
recitals_duplicates = []
for file_path in tqdm(content_files, desc="Processing HTML files"):
with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
filename_no_ext = os.path.splitext(os.path.basename(file_path))[0]
if content.strip().startswith("<img"):
failed_analysis.append({
"file_path": file_path,
"error": "File starts with image tag"
})
continue
preamble_data = extract_preamble_from_html(content, file_path)
if preamble_data["success"]:
preamble_dataset.append({
"celex_id": filename_no_ext,
"formula": preamble_data["data"]["formula"],
"citations": preamble_data["data"]["citations"],
"recitals": preamble_data["data"]["recitals"],
"preamble_final": preamble_data["data"]["preamble_final"],
"footnotes": preamble_data["data"]["footnotes"]
})
if preamble_data["data"]["recitals_duplicates"]:
recitals_duplicates.append({
"celex_id": filename_no_ext,
"duplicates": preamble_data["data"]["recitals_duplicates"]
})
else:
failed_analysis.append({
"file_path": file_path,
"error": preamble_data["error"]
})
return preamble_dataset, failed_analysis, recitals_duplicates
def create_preamble_dataset_from_xml(xml_directory, output_file):
preamble_dataset = []
@@ -1598,6 +1825,32 @@ def extract_footnotes(content):
return footnotes
def extract_footnotes_from_html(html_content):
"""Extract footnotes from HTML content using BeautifulSoup."""
from bs4 import BeautifulSoup
footnotes = {}
soup = BeautifulSoup(html_content, 'html.parser')
# Find the footnotes section
footnotes_section = soup.find('section', id='footnotes')
if footnotes_section:
# Find all footnote list items
footnote_items = footnotes_section.find_all('li')
for item in footnote_items:
number = item.get('id').replace('fn', '')
# Find and remove the backlink
p_tag = item.find('p')
backlink = p_tag.find('a', class_='footnote-back')
if backlink:
backlink.decompose()
# Get the remaining HTML content
footnote_content = str(p_tag)
footnotes[number] = footnote_content
return footnotes
# --------------- TAGS ANALYSIS ---------------
def analyze_tag_usage(xml_files, root_tag="preface"):
@@ -1621,6 +1874,31 @@ def analyze_tag_usage(xml_files, root_tag="preface"):
return tag_counts
def get_files_with_specific_tags(xml_files, specific_tags, root_tag="preface"):
"""
Get a list of file IDs (filenames without extension) containing specific XML tags.
Args:
xml_files: List of XML file paths
specific_tags: Set of tags to search for
root_tag: Root tag to start searching from
Returns:
List of file IDs that contain any of the specific tags
"""
files_with_tags = []
for xml_file in tqdm(xml_files, desc="Searching for specific tags"):
with open(xml_file, "r", encoding="utf-8") as file:
xml_content = file.read()
doc_tags = extract_unique_tags_from_xml(xml_content, root_tag=root_tag)
if any(tag in doc_tags for tag in specific_tags):
file_id = os.path.splitext(os.path.basename(xml_file))[0]
files_with_tags.append(file_id)
return files_with_tags
def extract_tags(xml_string):
try:
root = ET.fromstring(xml_string)