Code development platform for open source projects from the European Union institutions

Skip to content
Snippets Groups Projects
Commit 7b66b641 authored by Julien Nowak
Browse files

Add crawler with XML classifier to discover schema

parent 27ebdb51
No related branches found
No related tags found
2 merge requests: !16 AITED-117: Glue tables and crawlers, !13 AITED-117: Create Glue tables and crawlers
Pipeline #45166 passed
......@@ -17,9 +17,9 @@ terraform_s3_bucket_name = "d-ew1-ted-ai-terraform"
terraform_dynamodb_table_name = "d-ew1-ted-ai-terraform-locks"
# TED AI project
s3_input_bucket_name = "d-ew1-ted-ai-input"
s3_data_bucket_name = "d-ew1-ted-ai-experiments-data"
s3_ml_data_bucket_name = "d-ew1-ted-ai-ml-data"
s3_input_bucket_name = "d-ew1-ted-ai-input"
s3_data_bucket_name = "d-ew1-ted-ai-experiments-data"
s3_ml_data_bucket_name = "d-ew1-ted-ai-ml-data"
s3_ml_models_bucket_name = "d-ew1-ted-ai-ml-models"
ingestion_checkpoint_table = "d-ew1-ted-ai-ingestion-checkpoint"
......
......@@ -45,9 +45,9 @@ module "queue" {
}
module "classifiers" {
source = "./modules/classifiers"
source = "./modules/classifiers"
sagemaker_classifiers_repository_name = var.sagemaker_classifiers_repository_name
tags = var.tags
tags = var.tags
}
module "glue" {
......
......@@ -6,7 +6,7 @@ resource "aws_ecr_repository" "sagemaker_classifiers" {
scan_on_push = true
}
tags = var.tags
tags = var.tags
}
resource "aws_ecr_repository" "sagemaker_classifiers_ci" {
......@@ -16,7 +16,7 @@ resource "aws_ecr_repository" "sagemaker_classifiers_ci" {
image_scanning_configuration {
scan_on_push = true
}
tags = var.tags
tags = var.tags
}
resource "aws_ecr_lifecycle_policy" "sagemaker_classifiers_ci" {
......
......@@ -2,8 +2,8 @@ resource "aws_glue_catalog_database" "aws_glue_catalog_database" {
name = var.glue_database_name
}
resource "aws_glue_catalog_table" "glue_input_change_notice_xml_table" {
name = "d_ew1_ted_ai_input_change_notice_xml"
resource "aws_glue_catalog_table" "glue_input_change_notice_r2_0_8_s02_xml_table" {
name = "d_ew1_ted_ai_input_change_notice_r2_0_8_s02_xml"
database_name = var.glue_database_name
table_type = "EXTERNAL_TABLE"
......@@ -14,24 +14,7 @@ resource "aws_glue_catalog_table" "glue_input_change_notice_xml_table" {
}
storage_descriptor {
location = "s3://${var.s3_bucket_map["input_bucket"].name}/resource_type=change_notice/format=xml/version=R2.0.8.S02.E01/"
input_format = "org.apache.hadoop.mapred.XmlInputFormat"
output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyXmlOutputFormat"
ser_de_info {
name = "input-data-stream"
serialization_library = "com.ibm.spss.hive.serde2.xml.XmlInputFormat"
parameters = {
"serialization.format" = 1
}
}
columns {
name = "TECHNICAL_SECTION"
type = "struct<RECEPTION_ID:string>"
comment = "from deserializer"
}
location = "s3://${var.s3_bucket_map["input_bucket"].name}/resource_type=change_notice/format=xml/version=R2.0.8.S02.E01/"
}
partition_keys {
......@@ -52,20 +35,29 @@ resource "aws_glue_catalog_table" "glue_input_change_notice_xml_table" {
}
}
# Custom Glue XML classifier. The input data crawler references it by name in
# its `classifiers` list so that schema discovery uses this definition.
resource "aws_glue_classifier" "glue_xml_classifier" {
name = "d-ew1-ted-ai-glue-xml-classifier"
xml_classifier {
classification = "XML"
# row_tag names the XML element treated as one record; here the TED_EXPORT
# element. NOTE(review): assumes one TED_EXPORT root per notice file — confirm.
row_tag = "TED_EXPORT"
}
}
resource "aws_glue_crawler" "glue_input_xml_data_crawler" {
database_name = var.glue_database_name
name = "d-ew1-ted-ai-input-data-crawler"
role = aws_iam_role.input_crawler.arn
classifiers = [aws_glue_classifier.glue_xml_classifier.name]
tags = var.tags
schema_change_policy {
delete_behavior = "LOG"
update_behavior = "LOG"
}
catalog_target {
database_name = var.glue_database_name
tables = [aws_glue_catalog_table.glue_input_change_notice_xml_table.name] #TODO: we can put multiple tables here ?
tables = [aws_glue_catalog_table.glue_input_change_notice_r2_0_8_s02_xml_table.name]
}
configuration = <<EOF
......@@ -143,6 +135,7 @@ data "aws_iam_policy_document" "glue_job_policy_document" {
"glue:BatchStopJobRun",
"glue:GetTable",
"glue:CreateTable",
"glue:UpdateTable",
"glue:GetPartitions",
"glue:BatchGetPartition",
"glue:BatchCreatePartition",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment