Code development platform for open source projects from the European Union institutions

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • ai4xml/playground
1 result
Show changes
Commits on Source (4)
......@@ -9,3 +9,6 @@ analyze_akn_datasets/__pycache__
Conval_API/ids.csv
prompt_engineering_experiments/evaluation_results/
prompt_engineering_experiments/data/*
dspy_programs/data
.DS_Store
%% Cell type:code id: tags:
``` python
!pip install tqdm
!pip install lxml
```
%% Output
Requirement already satisfied: tqdm in /home/nasredine/dev/work/ai4xml/playground/myenv/lib/python3.11/site-packages (4.66.2)
Requirement already satisfied: lxml in /home/nasredine/dev/work/ai4xml/playground/myenv/lib/python3.11/site-packages (5.2.2)
%% Cell type:code id: tags:
``` python
import os
from functions import *
```
%% Cell type:code id: tags:
``` python
cwd = os.getcwd()
documents_dir = os.path.join(f'{cwd}/data/genai4lex', 'Documents')
results_dir = os.path.join(cwd, 'results/genai4lex')
```
%% Cell type:markdown id: tags:
### download AKN documents from genai4lex repo
%% Cell type:code id: tags:
``` python
# Create results_dir if it does not exist.
# exist_ok=True replaces the isdir check: it is race-free and a no-op when
# the directory is already there.
os.makedirs(results_dir, exist_ok=True)
```
%% Cell type:code id: tags:
``` python
# Check if the 'Documents' folder exists, if not, download and extract the zip file
if not os.path.isdir(documents_dir):
    zip_url = "https://gitlab.com/CIRSFID/genai4lex/-/raw/main/LegalResources/Eur-Lex/2010-2021/Documents.zip?inline=false"
    download_and_extract_zip(cwd, zip_url)  # Ensure this function is defined to handle download and extraction
    # If the download/extract still did not produce the expected folder,
    # abort the notebook run rather than continue with missing data.
    if not os.path.isdir(documents_dir):
        print("Invalid directory path.")
        exit()
```
%% Cell type:markdown id: tags:
### download AKN schema
%% Cell type:code id: tags:
``` python
schema_dir = os.path.join(cwd, 'schema')
os.makedirs(schema_dir, exist_ok=True)
# Download Akoma Ntoso Schema
schema_url = "https://docs.oasis-open.org/legaldocml/akn-core/v1.0/os/part2-specs/schemas/akomantoso30.xsd"
schema_path = os.path.join(schema_dir, 'akomantoso30.xsd')
if not os.path.exists(schema_path):
download_schema(schema_url, schema_path)
```
%% Cell type:markdown id: tags:
### Analysis and statistics
%% Cell type:code id: tags:
``` python
# Assuming analyze_xml_files and the associated functions are defined elsewhere
results, stats = analyze_xml_files(documents_dir,schema_path)
```
%% Output
Analyzing XML files: 100%|██████████| 15283/15283 [00:22<00:00, 674.22it/s]
%% Cell type:code id: tags:
``` python
output_csv_path = os.path.join(results_dir, 'results.csv')
# Write results to CSV
write_results_to_csv(results, output_csv_path) # Ensure this function is defined
stats_file_path = os.path.join(results_dir, 'statistics.csv')
write_stats_to_file(stats, stats_file_path) # Ensure this function is defined
```
%% Cell type:code id: tags:
``` python
print(stats)
```
%% Output
{'Average Total Pages': 4.1452594385919, 'Missing Total Pages': 0, 'Missing OJ Number': 0, 'Missing Publication Date': 0, 'Earliest Publication Date': '2010-01-05', 'Latest Publication Date': '2021-08-09'}
%% Cell type:markdown id: tags:
## Schema validation
%% Cell type:markdown id: tags:
### validate documents
%% Cell type:code id: tags:
``` python
results, valid, invalid = validate_xml_files(documents_dir, schema_path)
```
%% Output
Validating XML files: 100%|██████████| 15283/15283 [00:16<00:00, 934.79file/s]
%% Cell type:code id: tags:
``` python
output_csv_path = os.path.join(results_dir, 'validation_results.csv')
write_results_to_csv2(results, output_csv_path)
print(f"Validation results have been written to {output_csv_path}")
```
%% Output
Validation results have been written to /home/nasredine/dev/work/ai4xml/playground/analyze_akn_datasets/results/genai4lex/validation_results.csv
%% Cell type:code id: tags:
``` python
print(f'Number of valid files {valid}')
print(f'Number of invalid files {invalid}')
```
%% Output
Number of valid files 12759
Number of invalid files 2524
......
%% Cell type:code id: tags:
``` python
!pip install dspy-ai
!pip install python-dotenv
!pip install rouge-score
```
%% Cell type:code id: tags:
``` python
from dotenv import load_dotenv
import os
import json
```
%% Cell type:code id: tags:
``` python
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
```
%% Cell type:markdown id: tags:
## Step 1: Setup
%% Cell type:code id: tags:
``` python
import dspy
turbo = dspy.OpenAI(api_key=api_key, model='gpt-3.5-turbo')
dspy.settings.configure(lm=turbo)
```
%% Cell type:markdown id: tags:
## Step 2: Define Signatures
%% Cell type:code id: tags:
``` python
# class Document:
# """A simple document class to simulate the expected structure."""
# def __init__(self, text):
# self.sections = [Section(text)]
# class Section:
# """A section of the document."""
# def __init__(self, text):
# self.text = text
# DSPy signature declaring the input/output contract for the LLM call:
# plain cover-page text in, AKN XML out.
class GenerateAKN(dspy.Signature):
    """Create an XML representation of a document cover page in the Akoma Ntoso (AKN) format.

    Fields:
        text: raw plain-text form of the document cover page (input).
        xml: Akoma Ntoso (AKN) XML rendering of that cover page (output).
    """
    text = dspy.InputField(desc="Raw text format of the document cover page")
    xml = dspy.OutputField(desc="Akoma Ntoso (AKN) XML representation of the input cover page")
```
%% Cell type:markdown id: tags:
## Step 3: Building the Transformation Pipeline
%% Cell type:code id: tags:
``` python
class DocumentToXMLPipeline(dspy.Module):
    """DSPy module that converts raw cover-page text into an AKN XML string
    wrapped in a <root> element."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought predictor built from the GenerateAKN signature.
        self.transform = dspy.ChainOfThought(GenerateAKN)

    def forward(self, text):
        # Empty/falsy input short-circuits to an empty string instead of
        # calling the language model.
        if not text:
            return ""
        # Generate XML for the cover page via the LM.
        xml_cover_page = self.transform(text=text)
        # Wrap the generated fragment in a root element.
        full_xml = f"<root>{xml_cover_page.xml}</root>"
        return full_xml
```
%% Cell type:markdown id: tags:
## Step 4: Executing the Pipeline (0-shot conversion without optimization)
%% Cell type:code id: tags:
``` python
xml_pipeline = DocumentToXMLPipeline()
```
%% Cell type:code id: tags:
``` python
def process_documents(dataset):
    """Run every cover page in *dataset* through the global xml_pipeline.

    Each dataset item must expose a 'plain_text' key; the pipeline's XML
    output for each item is collected in order and returned as a list.
    """
    return [xml_pipeline(record['plain_text']) for record in dataset]
```
%% Cell type:code id: tags:
``` python
# Running the pipeline with the example dataset
full_xml_outputs = process_documents(example_dataset)
for output in full_xml_outputs:
print(output)
```
%% Output
Prediction(
rationale='produce the xml. We will first identify the key elements of the cover page such as the title, date, proposal number, and the entities involved. We will then structure this information in the Akoma Ntoso (AKN) format.',
xml='```xml\n<coverPage>\n <title>Proposal for a REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL amending Regulation (EC) No 1008/2008 on common rules for the operation of air services in the Community</title>\n <date>21.12.2016</date>\n <proposalNumber>2016/0411 (COD)</proposalNumber>\n <entities>\n <entity type="author">'
)
Prediction(
rationale='produce the xml. We need to identify the key elements of the document cover page such as the title, date, file number, sender, recipient, and subject. We will then structure this information in the Akoma Ntoso (AKN) format.',
xml='```xml\n<coverPage>\n <title>COUNCIL OF THE EUROPEAN UNION</title>\n <date>27 February 2017</date>\n <language>en</language>\n <fileNumber>2016/0030 (COD)</fileNumber>\n <sender>General Secretariat of the Council</sender>\n <recipient>Permanent Representatives Committee</recipient>\n <subject>Proposal for a REGULATION OF THE EUROPEAN'
)
%% Cell type:markdown id: tags:
## Step 5: Optimizing the Pipeline
%% Cell type:code id: tags:
``` python
def load_data_from_json(file_path):
    """Deserialize and return the JSON document stored at *file_path*."""
    with open(file_path, 'r') as handle:
        return json.load(handle)
def prepare_example(text, expected_xml):
    """Wrap one (text, expected_xml) pair as a dspy.Example, marking 'text'
    as the input field. Both strings are whitespace-stripped."""
    # Assuming 'dspy.Example' is the correct class from your DSPy framework
    example = dspy.Example({
        'text': text.strip(),  # Using strip() to clean whitespace
        'expected_xml': expected_xml.strip()
    }).with_inputs("text")
    return example

def create_dataset(data):
    # *data* is a dict keyed by document id whose values carry 'text' and
    # 'expected_xml' (the shape written by extract_preface_content).
    return [prepare_example(item['text'], item['expected_xml']) for item in data.values()]
```
%% Cell type:code id: tags:
``` python
file_path = '/Users/nasredine/dev/work/playground/dspy_programs/prefaces.json'
# Load and prepare the dataset
data = load_data_from_json(file_path)
trainset = create_dataset(data)
```
%% Cell type:code id: tags:
``` python
trainset
```
%% Output
[Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/29</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>concerning the non-approval of<span class="ITALIC">Vitis vinifera</span>cane tannins as a basic substance in accordance with Regulation (EC) No 1107/2009 of the European Parliament and of the Council concerning the placing of plant protection products on the market</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>Commission Delegated Directive (EU) 2020/12</p>\n <p>of<docDate date="2019-08-02">2 August 2019</docDate></p>\n <p>supplementing Directive (EU) 2017/2397 of the European Parliament and of the Council as regards the standards for competences and corresponding knowledge and skills, for the practical examinations, for the approval of simulators and for medical fitness</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 53</span>— Uniform provisions concerning the approval of category L<span class="SUB">3</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/31]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/24</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>authorising an extension of use of chia seeds (<span class="ITALIC">Salvia hispanica</span>) as a novel food and the change of the conditions of use and the specific labelling requirements of chia seeds (<span class="ITALIC">Salvia hispanica</span>) under Regulation (EU) 2015/2283 of the European Parliament and of the Council and amending Commission Implementing Regulation (EU) 2017/2470</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/23</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>concerning the non-renewal of the approval of the active substance thiacloprid, in accordance with Regulation (EC) No 1107/2009 of the European Parliament and of the Council concerning the placing of plant protection products on the market, and amending the Annex to Commission Implementing Regulation (EU) No 540/2011</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/22</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-31">31 October 2019</docDate></p>\n <p>amending Annexes I and III to Regulation (EU) 2019/631 of the European Parliament and of the Council as regards the monitoring of CO<span class="SUB">2</span>emissions from new light commercial vehicles type-approved in a multi-stage process</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/25</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-13">13 January 2020</docDate></p>\n <p>amending and correcting Regulation (EC) No 1235/2008 laying down detailed rules for implementation of Council Regulation (EC) No 834/2007 as regards the arrangements for imports of organic products from third countries</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>\n <span>Commission Implementing<docType>Regulation</docType>(EU) No<docNumber>2020/30</docNumber></span>\n </p>\n <p>of<docDate date="2020-01-14">14 January 2020</docDate></p>\n <p>amending Implementing Regulation (EU) No 404/2011 as regards detailed rules for the direct electronic exchange of information enacted under the rules of the Common Fisheries Policy</p>\n </longTitle>\n </preface>'}) (input_keys={'text'}),
Example({'text': '', 'expected_xml': '<preface>\n <longTitle>\n <p>UN<span><docType>Regulation</docType>No 74</span>— Uniform provisions concerning the approval of category L<span class="SUB">1</span>vehicles with regard to the installation of lighting and light-signalling devices [2020/32]</p>\n </longTitle>\n </preface>'}) (input_keys={'text'})]
%% Cell type:code id: tags:
``` python
trainset[0]['expected_xml']
```
%% Output
'<preface>\n <longTitle>\n <p>\n <span>Commission Delegated<docType>Regulation</docType>(EU) No<docNumber>2020/11</docNumber></span>\n </p>\n <p>of<docDate date="2019-10-29">29 October 2019</docDate></p>\n <p>amending Regulation (EC) No 1272/2008 of the European Parliament and of the Council on classification, labelling and packaging of substances and mixtures as regards information relating to emergency health response</p>\n <p>(Text with EEA relevance)</p>\n </longTitle>\n </preface>'
%% Cell type:code id: tags:
``` python
from rouge_score import rouge_scorer
```
%% Cell type:code id: tags:
``` python
def validate_xml_rouge_score(example, pred, trace=None):
    """DSPy metric: ROUGE-score *pred* against example['expected_xml'].

    NOTE(review): the threshold below is 0.0 and fmeasure is never negative,
    so this metric currently accepts every prediction — raise the threshold
    to actually filter bootstrapped traces.
    NOTE(review): *pred* is passed straight to scorer.score, which expects a
    string; confirm the pipeline returns a plain string here and not a
    Prediction object.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    scores = scorer.score(example['expected_xml'], pred)
    # Extracting the F1 scores from the results
    rouge1_f1 = scores['rouge1'].fmeasure
    rougeL_f1 = scores['rougeL'].fmeasure
    print("ROUGE-1 F1:", rouge1_f1, "| ROUGE-L F1:", rougeL_f1)
    # Setting a threshold for ROUGE-L
    return rougeL_f1 >= 0.0  # Threshold can be adjusted as needed
```
%% Cell type:code id: tags:
``` python
from dspy.teleprompt import BootstrapFewShot
teleprompter = BootstrapFewShot(metric=validate_xml_rouge_score)
compiled_pipeline = teleprompter.compile(DocumentToXMLPipeline(), trainset=trainset)
```
%% Output
100%|██████████| 2/2 [00:00<00:00, 34.16it/s]
ROUGE-1 F1: 0.430939226519337 | ROUGE-L F1: 0.34254143646408847
ROUGE-1 F1: 0.22372881355932203 | ROUGE-L F1: 0.21694915254237288
Bootstrapped 2 full traces after 2 examples in round 0.
This diff is collapsed.
%% Cell type:markdown id: tags:
Download the data
%% Cell type:code id: tags:
``` python
import requests
import zipfile
import os
```
%% Cell type:code id: tags:
``` python
def download_and_extract_zip(url, extract_to):
    """Download a ZIP file from *url* and extract it into *extract_to*.

    The directory is created if missing; if it already contains any entries
    the download is skipped entirely.

    Args:
        url: HTTP(S) URL of the ZIP archive.
        extract_to: directory that receives the extracted files.
    """
    # Bug fix: the directory must exist before it is checked —
    # os.listdir raises FileNotFoundError on a missing path, so the
    # original order (listdir first, makedirs later) crashed on first run.
    os.makedirs(extract_to, exist_ok=True)
    if os.listdir(extract_to):
        print(f"Data already exists in {extract_to}. Skipping download.")
        return
    # Get the content from the URL
    response = requests.get(url)
    response.raise_for_status()  # Check that the request was successful
    # Path to save the downloaded ZIP file
    zip_path = os.path.join(extract_to, 'downloaded_files.zip')
    # Write the content to a ZIP file
    with open(zip_path, 'wb') as f:
        f.write(response.content)
    # Open the ZIP file and extract its contents
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    # Remove the archive once extracted; only its contents are needed.
    os.remove(zip_path)
    print(f"Files extracted to {extract_to}")
```
%% Cell type:code id: tags:
``` python
# URL of the file to be downloaded
s3_url = "https://ai4xml-data.s3.eu-west-1.amazonaws.com/planJO/selection_for_gen4ai/gen4ai_related_files.zip"
# Directory to store the extracted files
output_dir = 'data/genai4lex_word_docs/'
download_and_extract_zip(s3_url, output_dir)
```
%% Cell type:code id: tags:
``` python
from functions import *
```
%% Cell type:code id: tags:
``` python
# Specify the folder containing XML files and the output JSON file name
xml_folder = 'data/genai4lex_word_docs_xml'
output_json = 'prefaces.json'
extract_preface_content(xml_folder, output_json)
```
%% Cell type:markdown id: tags:
XML comparison for testing
Should ROUGE be replaced with a structural XML diff metric?
%% Cell type:code id: tags:
``` python
!pip install xmldiff
```
%% Output
Requirement already satisfied: xmldiff in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (2.7.0)
Requirement already satisfied: setuptools in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from xmldiff) (58.0.4)
Requirement already satisfied: lxml>=3.1.0 in /Users/nasredine/dev/work/playground/.venv/lib/python3.9/site-packages (from xmldiff) (5.2.2)
%% Cell type:code id: tags:
``` python
from xmldiff import main, formatting
def compare_xml_content(xml1, xml2):
    """Compare two XML documents and return the differences.

    Uses xmldiff's text differ; the return value is an XML string annotated
    with diff:insert / diff:delete markers produced by XMLFormatter.
    """
    diffs = main.diff_texts(xml1, xml2, formatter=formatting.XMLFormatter())
    return diffs
# Example XML documents
xml1 = """<root>
<child1 attribute="value1">Text1</child1>
<child2>Text2</child2>
</root>"""
xml2 = """<root>
<child1 attribute="value1">Text1</child1>
<child2>Text3</child2> <!-- Changed text -->
<child3>New child</child3> <!-- New element -->
</root>"""
# Get the difference
difference = compare_xml_content(xml1, xml2)
print("Differences:", difference)
```
%% Output
Differences: <root xmlns:diff="http://namespaces.shoobx.com/diff">
<child1 attribute="value1">Text1</child1>
<child2>Text<diff:delete>2</diff:delete><diff:insert>3</diff:insert></child2><diff:delete>
</diff:delete><diff:insert> </diff:insert><child3 diff:insert="">New child</child3><diff:insert> </diff:insert></root>
%% Cell type:code id: tags:
``` python
import xml.etree.ElementTree as ET
import logging
class XmlTree():
    """Structural comparator for xml.etree.ElementTree element trees.

    Compares tags, attributes, text, tails and children recursively;
    mismatches are logged at DEBUG level to 'xml-comparison.log'. A text
    value of '*' acts as a wildcard that matches anything.
    """

    def __init__(self):
        self.hdlr = logging.FileHandler('xml-comparison.log')
        self.formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        # Bug fix: the original never created self.logger, so every mismatch
        # branch crashed with AttributeError instead of returning False.
        self.hdlr.setFormatter(self.formatter)
        self.logger = logging.getLogger('xml-comparison')
        if not self.logger.handlers:
            self.logger.addHandler(self.hdlr)

    @staticmethod
    def convert_string_to_tree(xmlString):
        """Parse *xmlString* and return its root Element."""
        return ET.fromstring(xmlString)

    def xml_compare(self, x1, x2, excludes=None):
        """
        Compares two xml etrees
        :param x1: the first tree
        :param x2: the second tree
        :param excludes: list of string of attributes (and child tags) to
            exclude from comparison
        :return:
            True if both files match
        """
        # None sentinel avoids the mutable-default-argument pitfall.
        excludes = [] if excludes is None else excludes
        if x1.tag != x2.tag:
            self.logger.debug('Tags do not match: %s and %s', x1.tag, x2.tag)
            return False
        for name, value in x1.attrib.items():
            if name not in excludes:
                if x2.attrib.get(name) != value:
                    self.logger.debug('Attributes do not match: %s=%r, %s=%r',
                                      name, value, name, x2.attrib.get(name))
                    return False
        for name in x2.attrib.keys():
            if name not in excludes:
                if name not in x1.attrib:
                    self.logger.debug('x2 has an attribute x1 is missing: %s', name)
                    return False
        if not self.text_compare(x1.text, x2.text):
            self.logger.debug('text: %r != %r', x1.text, x2.text)
            return False
        if not self.text_compare(x1.tail, x2.tail):
            self.logger.debug('tail: %r != %r', x1.tail, x2.tail)
            return False
        # Bug fix: Element.getchildren() was removed in Python 3.9 (the
        # notebook output shows the resulting AttributeError); iterating
        # the element directly yields its children.
        cl1 = list(x1)
        cl2 = list(x2)
        if len(cl1) != len(cl2):
            self.logger.debug('children length differs, %i != %i',
                              len(cl1), len(cl2))
            return False
        i = 0
        for c1, c2 in zip(cl1, cl2):
            i += 1
            if c1.tag not in excludes:
                if not self.xml_compare(c1, c2, excludes):
                    self.logger.debug('children %i do not match: %s', i, c1.tag)
                    return False
        return True

    def text_compare(self, t1, t2):
        """
        Compare two text strings
        :param t1: text one
        :param t2: text two
        :return:
            True if a match ('*' on either side matches anything)
        """
        if not t1 and not t2:
            return True
        if t1 == '*' or t2 == '*':
            return True
        return (t1 or '').strip() == (t2 or '').strip()
```
%% Cell type:code id: tags:
``` python
xml1 = "<note><to>Tove</to><from>Jani</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"
xml2 = "<note><to>Tove</to><from>Daniel</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"
tree1 = XmlTree.convert_string_to_tree(xml1)
tree2 = XmlTree.convert_string_to_tree(xml2)
comparator = XmlTree()
if comparator.xml_compare(tree1, tree2, ["from"]):
print ("XMLs match")
else:
print ("XMLs don't match")
```
%% Output
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[6], line 10
6 tree2 = XmlTree.convert_string_to_tree(xml2)
8 comparator = XmlTree()
---> 10 if comparator.xml_compare(tree1, tree2, ["from"]):
11 print ("XMLs match")
12 else:
Cell In[4], line 46, in XmlTree.xml_compare(self, x1, x2, excludes)
44 self.logger.debug('tail: %r != %r' % (x1.tail, x2.tail))
45 return False
---> 46 cl1 = x1.getchildren()
47 cl2 = x2.getchildren()
48 if len(cl1) != len(cl2):
AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'getchildren'
%% Cell type:markdown id: tags:
# Extract text using pandoc
%% Cell type:code id: tags:
``` python
# Set the path to your documents and the output JSON file
root_folder = '/Users/nasredine/dev/work/playground/dspy_programs/data/genai4lex_word_docs_xml'
output_json = 'md_text_output.json'
process_documents(root_folder, output_json)
```
%% Cell type:code id: tags:
``` python
# Example usage
xml_data = """<your xml string here>"""
md_data = """<your markdown string here>"""
result = extract_and_find(xml_data, md_data)
print(result)
```
This diff is collapsed.
import re
import os
import json
import subprocess
import xml.etree.ElementTree as ET
import requests
import zipfile
def normalize_text(text):
    """Collapse all whitespace runs to single spaces, lowercase, and trim."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip().lower()

def extract_and_find(xml_string, md_string, length=30):
    """Locate the normalized XML text inside the normalized Markdown text.

    Uses the first and last *length* characters of the normalized XML as
    anchors; reports the index span when both anchors occur in order in
    the Markdown, otherwise reports that no match was found.
    """
    needle = normalize_text(xml_string)
    haystack = normalize_text(md_string)
    head, tail = needle[:length], needle[-length:]
    begin = haystack.find(head)
    finish = haystack.rfind(tail)
    if begin != -1 and finish != -1 and begin < finish:
        return f"Text likely spans from index {begin} to {finish + length} in the Markdown file."
    return "Matching text not found in Markdown."
def remove_namespaces(xml_element):
    """Strip '{namespace}' prefixes from every tag and attribute, in place."""
    for node in xml_element.iter():
        # ElementTree spells namespaced names as '{uri}local'; keep 'local'.
        if '}' in node.tag:
            node.tag = node.tag.split('}', 1)[1]
        for key in [k for k in node.attrib if '}' in k]:
            node.attrib[key.split('}', 1)[1]] = node.attrib.pop(key)
# NOTE(review): this is an exact duplicate of the remove_namespaces defined
# immediately above — it rebinds the same name with identical code and can
# safely be deleted.
def remove_namespaces(xml_element):
    """ Recursively remove namespace prefixes from an XML element and its children. """
    for elem in xml_element.iter():
        if '}' in elem.tag:
            elem.tag = elem.tag.split('}', 1)[1]  # Removes namespace
        # Update attributes to remove namespaces
        attributes = list(elem.attrib.keys())
        for attr in attributes:
            if '}' in attr:
                new_attr = attr.split('}', 1)[1]
                elem.attrib[new_attr] = elem.attrib.pop(attr)
def extract_preface_content(xml_folder, output_json):
    """Collect the <preface> element of every XML file under *xml_folder*.

    Walks the folder recursively; for each .xml file, parses it, strips XML
    namespaces, and serializes its first <preface> element. Results are
    keyed by filename without extension (treated as the CELEX id) and
    written as a single JSON document to *output_json*. Files lacking a
    preface are recorded with the sentinel string "No preface found".
    """
    results = {}
    # Iterate over every XML file in the folder and its subfolders
    for root_dir, sub_dirs, files in os.walk(xml_folder):
        for filename in files:
            if filename.endswith('.xml'):
                # Remove the file extension from the filename
                filename_no_ext = os.path.splitext(filename)[0]
                file_path = os.path.join(root_dir, filename)
                tree = ET.parse(file_path)
                root = tree.getroot()
                # Strip namespaces so the XPath below can use plain tag names.
                remove_namespaces(root)
                preface = root.find('.//preface')
                if preface is not None:
                    # Serialize the <preface> element including its content.
                    preface_xml = ET.tostring(preface, encoding='unicode')
                    results[filename_no_ext] = {
                        'celex_id': filename_no_ext,
                        'expected_xml': preface_xml,
                        'text': ""
                    }
                else:
                    results[filename_no_ext] = {
                        'celex_id': filename_no_ext,
                        'expected_xml': "No preface found",
                        'text': ""
                    }
    # Bug fix: the original called json.dump twice on the same handle,
    # writing two concatenated JSON documents — an invalid JSON file.
    with open(output_json, 'w') as json_file:
        json.dump(results, json_file, indent=4)
def convert_docx_to_md(docx_path):
    """Convert a DOCX file to Markdown using Pandoc.

    Returns the Markdown text, or None when the conversion fails (pandoc
    missing from PATH, or pandoc exiting with a non-zero status).
    """
    try:
        # Bug fix: without check=True subprocess.run never raises
        # CalledProcessError, so the except branch below was dead code and
        # pandoc failures silently returned an empty string.
        result = subprocess.run(
            ['pandoc', '-f', 'docx', '-t', 'markdown', docx_path],
            capture_output=True, text=True, check=True)
        return result.stdout
    except FileNotFoundError as e:
        # pandoc is not installed / not on PATH
        print("Pandoc executable not found:", e)
        return None
    except subprocess.CalledProcessError as e:
        print("An error occurred while converting DOCX to Markdown:", e)
        return None
def process_documents(root_folder, output_json):
    """Convert every .docx under *root_folder* to Markdown and dump to JSON.

    The name of each .docx file's parent directory is taken as its CELEX id
    and used as the JSON key; files whose conversion yields nothing are
    skipped.
    """
    converted = {}
    for current_dir, _subdirs, filenames in os.walk(root_folder):
        for name in (n for n in filenames if n.endswith('.docx')):
            celex_id = os.path.basename(current_dir)  # parent folder = CELEX ID
            markdown_text = convert_docx_to_md(os.path.join(current_dir, name))
            if markdown_text:
                converted[celex_id] = markdown_text
    with open(output_json, 'w') as json_file:
        json.dump(converted, json_file, indent=4)
def download_and_extract_zip(script_dir, zip_url):
    """Download Documents.zip from *zip_url* into *script_dir* and extract it.

    The temporary archive is deleted after extraction.

    Args:
        script_dir: directory that receives both the archive and its contents.
        zip_url: HTTP(S) URL of the ZIP archive.

    Raises:
        requests.HTTPError: when the server responds with an error status.
    """
    print("The 'Documents' folder does not exist. Downloading and extracting the zip file...")
    # Download the zip file
    zip_file = os.path.join(script_dir, "Documents.zip")
    response = requests.get(zip_url)
    # Fix: fail loudly on HTTP errors instead of saving an error page as a
    # .zip (which would crash later as a BadZipFile). This mirrors the
    # raise_for_status() used by the notebook's download helper.
    response.raise_for_status()
    with open(zip_file, "wb") as f:
        f.write(response.content)
    # Extract the zip file
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(script_dir)
    # Remove the downloaded zip file
    os.remove(zip_file)
\ No newline at end of file
This diff is collapsed.