Code development platform for open source projects from the European Union institutions

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • ai4xml/playground
1 result
Show changes
Commits on Source (2)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -19,7 +19,18 @@ def normalize_text(text):
text = re.sub(r'\s+', ' ', text) # Replace multiple whitespace with single space
return text.lower().strip() # Convert to lowercase and strip leading/trailing spaces
def normalize_text(text):
    """Strip XML tags and all whitespace from *text* and lowercase the rest.

    Returns the tag-free, space-free, lower-cased form of the input string.
    """
    without_markup = re.sub(r'<[^>]+>', '', text)
    without_spaces = re.sub(r'\s+', '', without_markup)
    return without_spaces.lower()
def extract_and_find(xml_string, md_string, length=30):
# Check if the input strings are empty
if not xml_string or not md_string:
return "Input strings must not be empty."
# Extract from XML
normalized_xml = normalize_text(xml_string)
start_segment = normalized_xml[:length]
......@@ -32,13 +43,40 @@ def extract_and_find(xml_string, md_string, length=30):
start_index = normalized_md.find(start_segment)
end_index = normalized_md.rfind(end_segment)
# Extract and return the content if both indices are found and valid
if start_index != -1 and end_index != -1 and start_index < end_index:
return f"Text likely spans from index {start_index} to {end_index + length} in the Markdown file."
# Adjust the end index to include the end segment in the result
end_index_adjusted = end_index + length
extracted_content = normalized_md[start_index:end_index_adjusted]
return f"Text spans from index {start_index} to {end_index_adjusted}, content: '{extracted_content}'"
return "Matching text not found in Markdown."
def extract_preface(content):
    """Return the preface portion of *content*, or None when no marker is found.

    Two markers are recognised (case-insensitively), in priority order:
      1. the EEA-relevance note -- the preface runs up to and *including* it;
      2. an EU-institution heading followed by 'Having' -- the preface is
         everything strictly before that heading.
    """
    try:
        eea_note = re.search(r"\(Text with EEA relevance\)", content, re.IGNORECASE)
        if eea_note:
            return content[:eea_note.end()]
        institution_pattern = (
            r"(THE EUROPEAN COMMISSION,|THE COUNCIL OF THE EUROPEAN UNION,|"
            r"THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,)\s*Having"
        )
        heading = re.search(institution_pattern, content, re.IGNORECASE)
        if heading:
            return content[:heading.start()]
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
def remove_namespaces(xml_element):
""" Recursively remove namespace prefixes from an XML element and its children. """
for elem in xml_element.iter():
......@@ -63,46 +101,27 @@ def remove_namespaces(xml_element):
new_attr = attr.split('}', 1)[1]
elem.attrib[new_attr] = elem.attrib.pop(attr)
def extract_preface_content(xml_folder, output_json):
    """Collect the <preface> element of every XML file under *xml_folder*.

    Walks the folder tree recursively; for each ``.xml`` file, namespace
    prefixes are stripped and the first ``<preface>`` element is serialised
    to a unicode string. Results are keyed by the file name without its
    extension and written as a single JSON object to *output_json*.

    Each entry has the shape:
        {'celex_id': <name>, 'expected_xml': <xml or marker>, 'text': ""}
    """
    results = {}
    for root_dir, _sub_dirs, files in os.walk(xml_folder):
        for filename in files:
            if not filename.endswith('.xml'):
                continue
            filename_no_ext = os.path.splitext(filename)[0]
            file_path = os.path.join(root_dir, filename)
            tree = ET.parse(file_path)
            root = tree.getroot()
            remove_namespaces(root)
            preface = root.find('.//preface')
            # Serialise the element when present; otherwise record a marker string.
            if preface is not None:
                preface_xml = ET.tostring(preface, encoding='unicode')
            else:
                preface_xml = "No preface found"
            results[filename_no_ext] = {
                'celex_id': filename_no_ext,
                'expected_xml': preface_xml,
                'text': ""
            }
    # BUG FIX: the original called json.dump twice (the second call either
    # appended a duplicate document, yielding invalid JSON, or raised on a
    # closed file handle). Write the results exactly once.
    with open(output_json, 'w') as json_file:
        json.dump(results, json_file, indent=4)
def extract_document_part(xml_file, target_tag):
    """Return the first *target_tag* element of *xml_file* as a unicode XML
    string, or None when the tag is absent.

    Namespace prefixes are stripped from the parsed tree before searching,
    so *target_tag* is matched without any namespace qualification.
    """
    # FIX: dropped the dead locals of the original (`result = {}` and the
    # unused `filename_no_ext` derived from the file name) — neither was read.
    tree = ET.parse(xml_file)
    root = tree.getroot()
    remove_namespaces(root)
    target_element = root.find(f'.//{target_tag}')
    if target_element is None:
        return None
    return ET.tostring(target_element, encoding='unicode')
def convert_docx_to_md(docx_path):
......@@ -115,21 +134,12 @@ def convert_docx_to_md(docx_path):
print("An error occurred while converting DOCX to Markdown:", e)
return None
def process_documents(root_folder, output_json):
    """Convert every .docx file under *root_folder* to Markdown and write the
    results, keyed by the parent folder name (assumed to be the CELEX id),
    into *output_json* as a JSON object.
    """
    results = {}
    for current_dir, _dirs, file_names in os.walk(root_folder):
        for name in file_names:
            if not name.endswith('.docx'):
                continue
            celex_id = os.path.basename(current_dir)  # parent folder is the CELEX id
            markdown_text = convert_docx_to_md(os.path.join(current_dir, name))
            if markdown_text:
                results[celex_id] = markdown_text
    with open(output_json, 'w') as json_file:
        json.dump(results, json_file, indent=4)
def process_document(file_path):
    """Return the Markdown conversion of a single .docx file.

    Returns None when *file_path* is not a .docx file or when the
    conversion yields nothing.
    """
    if not file_path.endswith('.docx'):
        return None
    return convert_docx_to_md(file_path) or None
def download_and_extract_zip(script_dir, zip_url):
print("The 'Documents' folder does not exist. Downloading and extracting the zip file...")
......@@ -175,7 +185,6 @@ def analyze_results(data):
for series_type, count in series_types.items():
print(f"{series_type}: {count}")
def analyze_xml_files(directory):
results = []
namespace = {
......@@ -207,13 +216,18 @@ def analyze_xml_files(directory):
# Extract series type
series_type = root.find('.//fmx:COLL', namespace).text if root.find('.//fmx:COLL', namespace) is not None else 'Not found'
# Extract total pages
total_pages = root.find('.//fmx:PAGE.TOTAL', namespace)
pages_total = total_pages.text if total_pages is not None else 'Not found'
results.append({
'File': filename,
'Act Type': act_type,
'Publication Date': pub_date,
'CELEX Number': celex_number,
'Language': language,
'Series Type': series_type
'Series Type': series_type,
'Total Pages': pages_total
})
except etree.XMLSyntaxError as e:
......
This diff is collapsed.