From 7b66b641d30f7e9db24379e4dc214c25615dc793 Mon Sep 17 00:00:00 2001 From: nowakju <julien.nowak@arhs-spikeseed.com> Date: Mon, 17 Apr 2023 17:08:40 +0200 Subject: [PATCH] Add crawler with XML classifier to discover schema --- dev.tfvars | 6 +++--- main.tf | 4 ++-- modules/classifiers/ecr.tf | 4 ++-- modules/glue/glue.tf | 37 +++++++++++++++---------------------- 4 files changed, 22 insertions(+), 29 deletions(-) diff --git a/dev.tfvars b/dev.tfvars index 21436ea..3e634ea 100644 --- a/dev.tfvars +++ b/dev.tfvars @@ -17,9 +17,9 @@ terraform_s3_bucket_name = "d-ew1-ted-ai-terraform" terraform_dynamodb_table_name = "d-ew1-ted-ai-terraform-locks" # TED AI project -s3_input_bucket_name = "d-ew1-ted-ai-input" -s3_data_bucket_name = "d-ew1-ted-ai-experiments-data" -s3_ml_data_bucket_name = "d-ew1-ted-ai-ml-data" +s3_input_bucket_name = "d-ew1-ted-ai-input" +s3_data_bucket_name = "d-ew1-ted-ai-experiments-data" +s3_ml_data_bucket_name = "d-ew1-ted-ai-ml-data" s3_ml_models_bucket_name = "d-ew1-ted-ai-ml-models" ingestion_checkpoint_table = "d-ew1-ted-ai-ingestion-checkpoint" diff --git a/main.tf b/main.tf index eaeccdc..f7db16f 100644 --- a/main.tf +++ b/main.tf @@ -45,9 +45,9 @@ module "queue" { } module "classifiers" { - source = "./modules/classifiers" + source = "./modules/classifiers" sagemaker_classifiers_repository_name = var.sagemaker_classifiers_repository_name - tags = var.tags + tags = var.tags } module "glue" { diff --git a/modules/classifiers/ecr.tf b/modules/classifiers/ecr.tf index 3c7f5e3..ac40c3d 100644 --- a/modules/classifiers/ecr.tf +++ b/modules/classifiers/ecr.tf @@ -6,7 +6,7 @@ resource "aws_ecr_repository" "sagemaker_classifiers" { scan_on_push = true } - tags = var.tags + tags = var.tags } resource "aws_ecr_repository" "sagemaker_classifiers_ci" { @@ -16,7 +16,7 @@ resource "aws_ecr_repository" "sagemaker_classifiers_ci" { image_scanning_configuration { scan_on_push = true } - tags = var.tags + tags = var.tags } resource "aws_ecr_lifecycle_policy" "sagemaker_classifiers_ci" { diff --git a/modules/glue/glue.tf b/modules/glue/glue.tf index 23cd70e..d520382 100644 --- a/modules/glue/glue.tf +++ b/modules/glue/glue.tf @@ -2,8 +2,8 @@ resource "aws_glue_catalog_database" "aws_glue_catalog_database" { name = var.glue_database_name } -resource "aws_glue_catalog_table" "glue_input_change_notice_xml_table" { - name = "d_ew1_ted_ai_input_change_notice_xml" +resource "aws_glue_catalog_table" "glue_input_change_notice_r2_0_8_s02_xml_table" { + name = "d_ew1_ted_ai_input_change_notice_r2_0_8_s02_xml" database_name = var.glue_database_name table_type = "EXTERNAL_TABLE" @@ -14,24 +14,7 @@ resource "aws_glue_catalog_table" "glue_input_change_notice_xml_table" { } storage_descriptor { - location = "s3://${var.s3_bucket_map["input_bucket"].name}/resource_type=change_notice/format=xml/version=R2.0.8.S02.E01/" - input_format = "org.apache.hadoop.mapred.XmlInputFormat" - output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyXmlOutputFormat" - - ser_de_info { - name = "input-data-stream" - serialization_library = "com.ibm.spss.hive.serde2.xml.XmlInputFormat" - - parameters = { - "serialization.format" = 1 - } - } - - columns { - name = "TECHNICAL_SECTION" - type = "struct<RECEPTION_ID:string>" - comment = "from deserializer" - } + location = "s3://${var.s3_bucket_map["input_bucket"].name}/resource_type=change_notice/format=xml/version=R2.0.8.S02.E01/" } partition_keys { @@ -52,20 +35,29 @@ resource "aws_glue_catalog_table" "glue_input_change_notice_xml_table" { } } +resource "aws_glue_classifier" "glue_xml_classifier" { + name = "d-ew1-ted-ai-glue-xml-classifier" + + xml_classifier { + classification = "XML" + row_tag = "TED_EXPORT" + } +} + resource "aws_glue_crawler" "glue_input_xml_data_crawler" { database_name = var.glue_database_name name = "d-ew1-ted-ai-input-data-crawler" role = aws_iam_role.input_crawler.arn + classifiers = [aws_glue_classifier.glue_xml_classifier.name] tags = var.tags schema_change_policy { delete_behavior = "LOG" - update_behavior = "LOG" } catalog_target { database_name = var.glue_database_name - tables = [aws_glue_catalog_table.glue_input_change_notice_xml_table.name] #TODO: we can put multiple tables here ? + tables = [aws_glue_catalog_table.glue_input_change_notice_r2_0_8_s02_xml_table.name] } configuration = <<EOF @@ -143,6 +135,7 @@ data "aws_iam_policy_document" "glue_job_policy_document" { "glue:BatchStopJobRun", "glue:GetTable", "glue:CreateTable", + "glue:UpdateTable", "glue:GetPartitions", "glue:BatchGetPartition", "glue:BatchCreatePartition", -- GitLab