# glue.tf - AWS Glue catalog database, table and crawler for the input bucket,
# plus the IAM role and policies the crawler runs with.
# Glue Data Catalog database; its name is supplied by var.glue_database_name.
resource "aws_glue_catalog_database" "aws_glue_catalog_database" {
  name = var.glue_database_name
}

# Glue catalog table over the XML change-notice objects in the input bucket
# (prefix resource_type=change_notice/format=xml/version=R2.0.8.S02.E01/),
# partitioned by year / month / day / contract_id.
resource "aws_glue_catalog_table" "glue_input_change_notice_xml_table" {
  name = "d_ew1_ted_ai_input_change_notice_xml"

  # Reference the database resource (same value as var.glue_database_name) so
  # Terraform knows the database must exist before the table is created.
  database_name = aws_glue_catalog_database.aws_glue_catalog_database.name

  table_type = "EXTERNAL_TABLE"

  parameters = {
    EXTERNAL = "TRUE"
    # NOTE(review): a parquet setting on a table whose data is XML - confirm it is needed.
    "parquet.compression" = "UNCOMPRESSED"
  }

  storage_descriptor {
    location = "s3://${var.s3_bucket_map["input_bucket"].name}/resource_type=change_notice/format=xml/version=R2.0.8.S02.E01/"

    # NOTE(review): these class names are stored as-is in the catalog; verify
    # that they match classes the table's consumers can actually load.
    input_format  = "org.apache.hadoop.mapred.XmlInputFormat"
    output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyXmlOutputFormat"

    ser_de_info {
      name = "input-data-stream"
      # NOTE(review): this value names an *InputFormat* class, but the field
      # expects a SerDe class (the IBM SPSS XML SerDe is normally
      # com.ibm.spss.hive.serde2.xml.XmlSerDe) - confirm before changing.
      serialization_library = "com.ibm.spss.hive.serde2.xml.XmlInputFormat"

      parameters = {
        "serialization.format" = 1
      }
    }

    # Only column declared on the table: a struct carrying RECEPTION_ID.
    columns {
      name    = "TECHNICAL_SECTION"
      type    = "struct<RECEPTION_ID:string>"
      comment = "from deserializer"
    }
  }

  # Partition columns, all typed as string; their order here is the order
  # recorded on the table.
  partition_keys {
    name = "year"
    type = "string"
  }
  partition_keys {
    name = "month"
    type = "string"
  }
  partition_keys {
    name = "day"
    type = "string"
  }
  partition_keys {
    name = "contract_id"
    type = "string"
  }
}

# Glue crawler that targets the existing change-notice catalog table and runs
# under the input crawler IAM role. Schema changes and deletions are only
# logged (both behaviours are set to "LOG").
resource "aws_glue_crawler" "glue_input_xml_data_crawler" {
  name = "d-ew1-ted-ai-input-data-crawler"
  role = aws_iam_role.input_crawler.arn
  tags = var.tags

  # Reference the database resource (same value as var.glue_database_name) so
  # the crawler is created only after the database exists.
  database_name = aws_glue_catalog_database.aws_glue_catalog_database.name

  schema_change_policy {
    delete_behavior = "LOG"
    update_behavior = "LOG"
  }

  catalog_target {
    database_name = aws_glue_catalog_database.aws_glue_catalog_database.name
    # `tables` is a list: further catalog tables of this database can be
    # appended here to have the same crawler process them.
    tables = [aws_glue_catalog_table.glue_input_change_notice_xml_table.name]
  }

  # Crawler configuration, built with jsonencode instead of a hand-written
  # heredoc so the JSON is always well-formed. New partitions inherit their
  # metadata from the table.
  configuration = jsonencode({
    Version = 1
    Grouping = {
      TableGroupingPolicy = "CombineCompatibleSchemas"
    }
    CrawlerOutput = {
      Partitions = {
        AddOrUpdateBehavior = "InheritFromTable"
      }
    }
  })
}

# Trust policy document: allows the AWS Glue service principal to call
# sts:AssumeRole on the role this document is attached to.
data "aws_iam_policy_document" "assume_glue_role" {
  statement {
    sid     = "StsAssumeGlueRole"
    effect  = "Allow" # explicit; identical to the data source default
    actions = ["sts:AssumeRole"]

    principals {
      identifiers = ["glue.amazonaws.com"]
      type        = "Service"
    }
  }
}

# Execution role for the Glue crawler on the input bucket. Glue assumes it via
# the assume_glue_role trust policy; it carries the S3 read-only, Glue and
# CloudWatch Logs policies defined in this file and is constrained by the
# account's Team_Admin_Boundary permissions boundary.
resource "aws_iam_role" "input_crawler" {
  name               = "${var.iam_role_prefix}_GLUE_CRAWLER_INPUT_ROLE"
  description        = "DPD Execution Role for Glue crawler on input bucket"
  assume_role_policy = data.aws_iam_policy_document.assume_glue_role.json

  # NOTE(review): recent AWS provider releases deprecate managed_policy_arns in
  # favour of separate policy-attachment resources - confirm against the
  # provider version pinned for this project before migrating.
  managed_policy_arns = [
    aws_iam_policy.dpd_storage_s3_input_bucket_readonly.arn,
    aws_iam_policy.glue_job_policy.arn,
    aws_iam_policy.cloudwatch_write_logs_policy.arn
  ]

  permissions_boundary = "arn:aws:iam::${var.account_id}:policy/Team_Admin_Boundary"
  tags                 = var.tags
}

# Managed policy wrapping the read-only S3 input bucket policy document.
# NOTE(review): unlike the other policy names in this file, no underscore
# separates the prefix from the suffix here - confirm whether that is
# intentional (changing the name would force replacement of the policy).
resource "aws_iam_policy" "dpd_storage_s3_input_bucket_readonly" {
  policy = data.aws_iam_policy_document.dpd_storage_s3_input_bucket_readonly.json

  name        = "${var.iam_policy_prefix}S3_STORAGE_INPUT_READ_ONLY_POLICY"
  description = "Read-only access to S3 input bucket"
}

# Read-only access to the input bucket, split into object-level and
# bucket-level statements because the two action groups apply to different
# resource ARNs (objects: "<bucket arn>/*", bucket: "<bucket arn>").
data "aws_iam_policy_document" "dpd_storage_s3_input_bucket_readonly" {
  # Object-level reads on every key in the bucket.
  statement {
    sid       = "S3getObjectInput"
    effect    = "Allow" # explicit; identical to the data source default
    actions   = ["s3:GetObject", "s3:GetObjectVersion"]
    resources = [format("%s/*", var.s3_bucket_map["input_bucket"].arn)]
  }

  # Bucket-level listing.
  statement {
    sid       = "S3ListBucketInput"
    effect    = "Allow" # explicit; identical to the data source default
    actions   = ["s3:ListBucket", "s3:ListBucketVersions"]
    resources = [var.s3_bucket_map["input_bucket"].arn]
  }
}

# Glue permissions document: job-run control, table/partition access,
# catalog-encryption settings and security-configuration management on all
# resources, plus glue:GetDatabase restricted to the account catalog and the
# project database.
data "aws_iam_policy_document" "glue_job_policy_document" {
  statement {
    sid       = "GlueReadStartJobRun"
    effect    = "Allow"
    resources = ["*"]

    actions = [
      # Job runs
      "glue:StartJobRun",
      "glue:GetJobRun",
      "glue:GetJobRuns",
      "glue:BatchStopJobRun",
      # Tables and partitions
      "glue:GetTable",
      "glue:CreateTable",
      "glue:GetPartitions",
      "glue:BatchGetPartition",
      "glue:BatchCreatePartition",
      "glue:UpdatePartition",
      # Data Catalog encryption settings
      "glue:GetDataCatalogEncryptionSettings",
      "glue:PutDataCatalogEncryptionSettings",
      # Security configurations
      "glue:CreateSecurityConfiguration",
      "glue:GetSecurityConfiguration",
      "glue:GetSecurityConfigurations",
      "glue:DeleteSecurityConfiguration"
    ]
  }

  statement {
    sid     = "GlueGetCatalogAndDpdDatabase"
    effect  = "Allow"
    actions = ["glue:GetDatabase"]

    resources = [
      format("arn:aws:glue:%s:%s:catalog", var.region, var.account_id),
      format("arn:aws:glue:%s:%s:database/%s", var.region, var.account_id, var.glue_database_name)
    ]
  }
}

# Managed policy wrapping the Glue permissions document above.
resource "aws_iam_policy" "glue_job_policy" {
  policy = data.aws_iam_policy_document.glue_job_policy_document.json

  name        = "${var.iam_policy_prefix}_GLUE_JOB_POLICY"
  description = "Policy to allow a step function to manage a Glue Job"
}

# CloudWatch Logs write permissions (create log groups/streams, put events) on
# every log group of the configured account and region.
data "aws_iam_policy_document" "cloudwatch_write_logs_policy_document" {
  statement {
    sid       = "CloudWatchWriteLogs"
    effect    = "Allow"
    resources = [format("arn:aws:logs:%s:%s:log-group:*", var.region, var.account_id)]

    actions = [
      "logs:CreateLogGroup",
      "logs:CreateLogStream",
      "logs:PutLogEvents"
    ]
  }
}

# Managed policy wrapping the CloudWatch Logs write document.
resource "aws_iam_policy" "cloudwatch_write_logs_policy" {
  policy = data.aws_iam_policy_document.cloudwatch_write_logs_policy_document.json

  name        = "${var.iam_policy_prefix}_CLOUDWATCH_WRITE_LOGS_POLICY"
  description = "Policy to allow writing logs in CloudWatch"
}