From 7b66b641d30f7e9db24379e4dc214c25615dc793 Mon Sep 17 00:00:00 2001
From: nowakju <julien.nowak@arhs-spikeseed.com>
Date: Mon, 17 Apr 2023 17:08:40 +0200
Subject: [PATCH] Add crawler with XML classifier to discover schema

---
 dev.tfvars                 |  6 +++---
 main.tf                    |  4 ++--
 modules/classifiers/ecr.tf |  4 ++--
 modules/glue/glue.tf       | 37 +++++++++++++++----------------------
 4 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/dev.tfvars b/dev.tfvars
index 21436ea..3e634ea 100644
--- a/dev.tfvars
+++ b/dev.tfvars
@@ -17,9 +17,9 @@ terraform_s3_bucket_name      = "d-ew1-ted-ai-terraform"
 terraform_dynamodb_table_name = "d-ew1-ted-ai-terraform-locks"
 
 # TED AI project
-s3_input_bucket_name   = "d-ew1-ted-ai-input"
-s3_data_bucket_name    = "d-ew1-ted-ai-experiments-data"
-s3_ml_data_bucket_name = "d-ew1-ted-ai-ml-data"
+s3_input_bucket_name     = "d-ew1-ted-ai-input"
+s3_data_bucket_name      = "d-ew1-ted-ai-experiments-data"
+s3_ml_data_bucket_name   = "d-ew1-ted-ai-ml-data"
 s3_ml_models_bucket_name = "d-ew1-ted-ai-ml-models"
 
 ingestion_checkpoint_table = "d-ew1-ted-ai-ingestion-checkpoint"
diff --git a/main.tf b/main.tf
index eaeccdc..f7db16f 100644
--- a/main.tf
+++ b/main.tf
@@ -45,9 +45,9 @@ module "queue" {
 }
 
 module "classifiers" {
-  source = "./modules/classifiers"
+  source                                = "./modules/classifiers"
   sagemaker_classifiers_repository_name = var.sagemaker_classifiers_repository_name
-  tags = var.tags
+  tags                                  = var.tags
 }
 
 module "glue" {
diff --git a/modules/classifiers/ecr.tf b/modules/classifiers/ecr.tf
index 3c7f5e3..ac40c3d 100644
--- a/modules/classifiers/ecr.tf
+++ b/modules/classifiers/ecr.tf
@@ -6,7 +6,7 @@ resource "aws_ecr_repository" "sagemaker_classifiers" {
     scan_on_push = true
   }
 
-  tags                       = var.tags
+  tags = var.tags
 }
 
 resource "aws_ecr_repository" "sagemaker_classifiers_ci" {
@@ -16,7 +16,7 @@ resource "aws_ecr_repository" "sagemaker_classifiers_ci" {
   image_scanning_configuration {
     scan_on_push = true
   }
-  tags                       = var.tags
+  tags = var.tags
 }
 
 resource "aws_ecr_lifecycle_policy" "sagemaker_classifiers_ci" {
diff --git a/modules/glue/glue.tf b/modules/glue/glue.tf
index 23cd70e..d520382 100644
--- a/modules/glue/glue.tf
+++ b/modules/glue/glue.tf
@@ -2,8 +2,8 @@ resource "aws_glue_catalog_database" "aws_glue_catalog_database" {
   name = var.glue_database_name
 }
 
-resource "aws_glue_catalog_table" "glue_input_change_notice_xml_table" {
-  name          = "d_ew1_ted_ai_input_change_notice_xml"
+resource "aws_glue_catalog_table" "glue_input_change_notice_r2_0_8_s02_xml_table" {
+  name          = "d_ew1_ted_ai_input_change_notice_r2_0_8_s02_xml"
   database_name = var.glue_database_name
 
   table_type = "EXTERNAL_TABLE"
@@ -14,24 +14,7 @@ resource "aws_glue_catalog_table" "glue_input_change_notice_xml_table" {
   }
 
   storage_descriptor {
-    location      = "s3://${var.s3_bucket_map["input_bucket"].name}/resource_type=change_notice/format=xml/version=R2.0.8.S02.E01/"
-    input_format  = "org.apache.hadoop.mapred.XmlInputFormat"
-    output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyXmlOutputFormat"
-
-    ser_de_info {
-      name                  = "input-data-stream"
-      serialization_library = "com.ibm.spss.hive.serde2.xml.XmlInputFormat"
-
-      parameters = {
-        "serialization.format" = 1
-      }
-    }
-
-    columns {
-      name    = "TECHNICAL_SECTION"
-      type    = "struct<RECEPTION_ID:string>"
-      comment = "from deserializer"
-    }
+    location = "s3://${var.s3_bucket_map["input_bucket"].name}/resource_type=change_notice/format=xml/version=R2.0.8.S02.E01/" # only the S3 prefix is declared; columns/SerDe are left to the crawler. NOTE(review): crawler-written schema may show up as plan drift - confirm
   }
 
   partition_keys {
@@ -52,20 +35,29 @@ resource "aws_glue_catalog_table" "glue_input_change_notice_xml_table" {
   }
 }
 
+resource "aws_glue_classifier" "glue_xml_classifier" { # custom XML classifier, referenced by the input-data crawler's classifiers list
+  name = "d-ew1-ted-ai-glue-xml-classifier"
+
+  xml_classifier {
+    classification = "XML"
+    row_tag        = "TED_EXPORT" # XML element that delimits one record; assumes each notice is rooted at <TED_EXPORT> - TODO confirm
+  }
+}
+
 resource "aws_glue_crawler" "glue_input_xml_data_crawler" {
   database_name = var.glue_database_name
   name          = "d-ew1-ted-ai-input-data-crawler"
   role          = aws_iam_role.input_crawler.arn
+  classifiers   = [aws_glue_classifier.glue_xml_classifier.name] # use the custom XML classifier (row tag TED_EXPORT) for this crawler
   tags          = var.tags
 
   schema_change_policy {
     delete_behavior = "LOG"
-    update_behavior = "LOG"
   }
 
   catalog_target {
     database_name = var.glue_database_name
-    tables        = [aws_glue_catalog_table.glue_input_change_notice_xml_table.name] #TODO: we can put multiple tables here ?
+    tables        = [aws_glue_catalog_table.glue_input_change_notice_r2_0_8_s02_xml_table.name] # crawl only the renamed R2.0.8.S02 change-notice table
   }
   configuration = <<EOF
@@ -143,6 +135,7 @@ data "aws_iam_policy_document" "glue_job_policy_document" {
       "glue:BatchStopJobRun",
       "glue:GetTable",
       "glue:CreateTable",
+      "glue:UpdateTable",
       "glue:GetPartitions",
       "glue:BatchGetPartition",
       "glue:BatchCreatePartition",
-- 
GitLab