Code development platform for open source projects from the European Union institutions

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • ai4xml/playground
1 result
Show changes
Commits on Source (2)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -19,7 +19,18 @@ def normalize_text(text):
text = re.sub(r'\s+', ' ', text) # Replace multiple whitespace with single space
return text.lower().strip() # Convert to lowercase and strip leading/trailing spaces
def normalize_text(text):
    """Strip XML tags and all whitespace from *text* and lowercase the rest.

    Returns the tag-free, space-free, lower-cased form of the input string.
    """
    without_markup = re.sub(r'<[^>]+>', '', text)
    without_spaces = re.sub(r'\s+', '', without_markup)
    return without_spaces.lower()
def extract_and_find(xml_string, md_string, length=30):
# Check if the input strings are empty
if not xml_string or not md_string:
return "Input strings must not be empty."
# Extract from XML
normalized_xml = normalize_text(xml_string)
start_segment = normalized_xml[:length]
......@@ -32,13 +43,40 @@ def extract_and_find(xml_string, md_string, length=30):
start_index = normalized_md.find(start_segment)
end_index = normalized_md.rfind(end_segment)
# Extract and return the content if both indices are found and valid
if start_index != -1 and end_index != -1 and start_index < end_index:
return f"Text likely spans from index {start_index} to {end_index + length} in the Markdown file."
# Adjust the end index to include the end segment in the result
end_index_adjusted = end_index + length
extracted_content = normalized_md[start_index:end_index_adjusted]
return f"Text spans from index {start_index} to {end_index_adjusted}, content: '{extracted_content}'"
return "Matching text not found in Markdown."
def extract_preface(content):
    """Return the preface portion of *content*, or None when no marker is found.

    Two markers are recognised (case-insensitively), in priority order:
      1. the EEA-relevance note -- the preface runs up to and *including* it;
      2. an EU-institution heading followed by 'Having' -- the preface is
         everything strictly before that heading.
    """
    try:
        eea_note = re.search(r"\(Text with EEA relevance\)", content, re.IGNORECASE)
        if eea_note:
            return content[:eea_note.end()]
        institution_pattern = (
            r"(THE EUROPEAN COMMISSION,|THE COUNCIL OF THE EUROPEAN UNION,|"
            r"THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,)\s*Having"
        )
        heading = re.search(institution_pattern, content, re.IGNORECASE)
        if heading:
            return content[:heading.start()]
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
def remove_namespaces(xml_element):
""" Recursively remove namespace prefixes from an XML element and its children. """
for elem in xml_element.iter():
......@@ -63,46 +101,27 @@ def remove_namespaces(xml_element):
new_attr = attr.split('}', 1)[1]
elem.attrib[new_attr] = elem.attrib.pop(attr)
def extract_preface_content(xml_folder, output_json):
    """Collect the <preface> element of every XML file under *xml_folder*.

    Walks the folder tree recursively; for each ``.xml`` file, namespace
    prefixes are stripped and the first ``<preface>`` element is serialised
    to a unicode string. Results are keyed by the file name without its
    extension and written as a single JSON object to *output_json*.

    Each entry has the shape:
        {'celex_id': <name>, 'expected_xml': <xml or marker>, 'text': ""}
    """
    results = {}
    for root_dir, _sub_dirs, files in os.walk(xml_folder):
        for filename in files:
            if not filename.endswith('.xml'):
                continue
            filename_no_ext = os.path.splitext(filename)[0]
            file_path = os.path.join(root_dir, filename)
            tree = ET.parse(file_path)
            root = tree.getroot()
            remove_namespaces(root)
            preface = root.find('.//preface')
            # Serialise the element when present; otherwise record a marker string.
            if preface is not None:
                preface_xml = ET.tostring(preface, encoding='unicode')
            else:
                preface_xml = "No preface found"
            results[filename_no_ext] = {
                'celex_id': filename_no_ext,
                'expected_xml': preface_xml,
                'text': ""
            }
    # BUG FIX: the original called json.dump twice (the second call either
    # appended a duplicate document, yielding invalid JSON, or raised on a
    # closed file handle). Write the results exactly once.
    with open(output_json, 'w') as json_file:
        json.dump(results, json_file, indent=4)
def extract_document_part(xml_file, target_tag):
    """Return the first *target_tag* element of *xml_file* as a unicode XML
    string, or None when the tag is absent.

    Namespace prefixes are stripped from the parsed tree before searching,
    so *target_tag* is matched without any namespace qualification.
    """
    # FIX: dropped the dead locals of the original (`result = {}` and the
    # unused `filename_no_ext` derived from the file name) — neither was read.
    tree = ET.parse(xml_file)
    root = tree.getroot()
    remove_namespaces(root)
    target_element = root.find(f'.//{target_tag}')
    if target_element is None:
        return None
    return ET.tostring(target_element, encoding='unicode')
def convert_docx_to_md(docx_path):
......@@ -115,21 +134,12 @@ def convert_docx_to_md(docx_path):
print("An error occurred while converting DOCX to Markdown:", e)
return None
def process_documents(root_folder, output_json):
    """Convert every .docx file under *root_folder* to Markdown and write the
    results, keyed by the parent folder name (assumed to be the CELEX id),
    into *output_json* as a JSON object.
    """
    results = {}
    for current_dir, _dirs, file_names in os.walk(root_folder):
        for name in file_names:
            if not name.endswith('.docx'):
                continue
            celex_id = os.path.basename(current_dir)  # parent folder is the CELEX id
            markdown_text = convert_docx_to_md(os.path.join(current_dir, name))
            if markdown_text:
                results[celex_id] = markdown_text
    with open(output_json, 'w') as json_file:
        json.dump(results, json_file, indent=4)
def process_document(file_path):
    """Return the Markdown conversion of a single .docx file.

    Returns None when *file_path* is not a .docx file or when the
    conversion yields nothing.
    """
    if not file_path.endswith('.docx'):
        return None
    return convert_docx_to_md(file_path) or None
def download_and_extract_zip(script_dir, zip_url):
print("The 'Documents' folder does not exist. Downloading and extracting the zip file...")
......@@ -175,7 +185,6 @@ def analyze_results(data):
for series_type, count in series_types.items():
print(f"{series_type}: {count}")
def analyze_xml_files(directory):
results = []
namespace = {
......@@ -207,13 +216,18 @@ def analyze_xml_files(directory):
# Extract series type
series_type = root.find('.//fmx:COLL', namespace).text if root.find('.//fmx:COLL', namespace) is not None else 'Not found'
# Extract total pages
total_pages = root.find('.//fmx:PAGE.TOTAL', namespace)
pages_total = total_pages.text if total_pages is not None else 'Not found'
results.append({
'File': filename,
'Act Type': act_type,
'Publication Date': pub_date,
'CELEX Number': celex_number,
'Language': language,
'Series Type': series_type
'Series Type': series_type,
'Total Pages': pages_total
})
except etree.XMLSyntaxError as e:
......
This diff is collapsed.