#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Tests for the yaml MLTransform provider (tft preprocessing and
embedding transforms expressed via YamlTransform)."""

import logging
import tempfile
import unittest

import apache_beam as beam
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.yaml.yaml_transform import YamlTransform

try:
  # pylint: disable=wrong-import-order, wrong-import-position, unused-import
  from apache_beam.ml.transforms import tft
except ImportError:
  raise unittest.SkipTest('tensorflow_transform is not installed.')

# Rows used to fit (write) the ML transform artifacts.
TRAIN_DATA = [
    beam.Row(num=0, text='And God said, Let there be light,'),
    beam.Row(num=2, text='And there was light'),
    beam.Row(num=8, text='And God saw the light, that it was good'),
]

# Row used to apply (read) previously-written artifacts.
TEST_DATA = [
    beam.Row(num=6, text='And God divided the light from the darkness.'),
]


class MLTransformTest(unittest.TestCase):
  """End-to-end tests of the yaml MLTransform syntax.

  Each test runs a real local pipeline: first in "write" mode to fit and
  persist transform artifacts under a temporary directory, then (where
  applicable) in "read" mode to apply those artifacts to fresh data.
  """
  @staticmethod
  def _ml_options():
    """Pipeline options shared by every test.

    cloudpickle is required to pickle the ML transform closures, and the
    'ML' experimental feature flag gates yaml MLTransform support.
    """
    return beam.options.pipeline_options.PipelineOptions(
        pickle_library='cloudpickle', yaml_experimental_features=['ML'])

  def test_ml_transform(self):
    """ScaleTo01 + ComputeAndApplyVocabulary: write then read artifacts."""
    ml_opts = self._ml_options()
    with tempfile.TemporaryDirectory() as tempdir:
      # Write mode: fit the scaler and the vocabulary on TRAIN_DATA and
      # persist the artifacts to tempdir.
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(TRAIN_DATA)
        result = elements | YamlTransform(
            f'''
            type: MLTransform
            config:
              write_artifact_location: {tempdir}
              transforms:
              - type: ScaleTo01
                config:
                  columns: [num]
              - type: ComputeAndApplyVocabulary
                config:
                  columns: [text]
                  split_string_by_delimiter: ' ,.'
            ''')
        assert_that(
            # Why is this an array, not a scalar?
            result | beam.Map(lambda x: x.num[0]),
            equal_to([0, .25, 1]))
        assert_that(
            result | beam.Map(lambda x: set(x.text))
            | beam.CombineGlobally(lambda xs: set.union(*xs)),
            equal_to([set(range(13))]),
            label='CheckVocab')

      # Read mode: apply the persisted artifacts to unseen TEST_DATA.
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(TEST_DATA)
        result = elements | YamlTransform(
            f'''
            type: MLTransform
            config:
              read_artifact_location: {tempdir}
            ''')
        # num=6 scales to (6-0)/(8-0) = .75 with the fitted min/max.
        assert_that(
            result | beam.Map(lambda x: x.num[0]), equal_to([.75]))
        assert_that(
            result | beam.Map(lambda x: len(set(x.text))),
            equal_to([5]),
            label='CheckVocab')

  def test_ml_transform_read_with_map_to_fields(self):
    """Read-mode MLTransform composed with MapToFields in a chain."""
    ml_opts = self._ml_options()
    with tempfile.TemporaryDirectory() as tempdir:
      # First, write the artifacts.
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(TRAIN_DATA)
        _ = elements | YamlTransform(
            f'''
            type: MLTransform
            config:
              write_artifact_location: {tempdir}
              transforms:
              - type: ScaleTo01
                config:
                  columns: [num]
              - type: ComputeAndApplyVocabulary
                config:
                  columns: [text]
                  split_string_by_delimiter: ' ,.'
            ''')

      # Now, read the artifacts and use MapToFields.
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(TEST_DATA)
        result = elements | YamlTransform(
            f'''
            type: chain
            transforms:
            - type: MLTransform
              config:
                read_artifact_location: {tempdir}
            - type: MapToFields
              config:
                language: python
                fields:
                  num_scaled: "num[0]"
                  text_vocab: text
            ''')

        def check_row(row):
          # assert (not self.assert*) because this runs inside a DoFn.
          assert row.num_scaled == 0.75
          assert len(set(row.text_vocab)) == 5
          return row.num_scaled

        assert_that(result | beam.Map(check_row), equal_to([0.75]))

  def test_sentence_transformer_embedding(self):
    """Embeddings from dict elements: correct count and dimension."""
    SENTENCE_EMBEDDING_DIMENSION = 384
    DATA = [{
        'id': 1, 'log_message': "Error in module A"
    }, {
        'id': 2, 'log_message': "Warning in module B"
    }, {
        'id': 3, 'log_message': "Info in module C"
    }]
    ml_opts = self._ml_options()
    with tempfile.TemporaryDirectory() as tempdir:
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(DATA)
        result = elements | YamlTransform(
            f'''
            type: MLTransform
            config:
              write_artifact_location: {tempdir}
              transforms:
              - type: SentenceTransformerEmbeddings
                config:
                  model_name: all-MiniLM-L6-v2
                  columns: [log_message]
            ''')
        # Perform a basic check to ensure that embeddings are generated
        # and that the dimension of those embeddings is correct.
        actual_output = result | beam.Map(lambda x: len(x['log_message']))
        assert_that(
            actual_output,
            equal_to([SENTENCE_EMBEDDING_DIMENSION] * len(DATA)))

  def test_sentence_transformer_embedding_with_beam_rows(self):
    """Embeddings from beam.Row elements: correct count and dimension."""
    SENTENCE_EMBEDDING_DIMENSION = 384
    DATA = [
        beam.Row(id=1, log_message="Error in module A"),
        beam.Row(id=2, log_message="Warning in module B"),
        beam.Row(id=3, log_message="Info in module C"),
    ]
    ml_opts = self._ml_options()
    with tempfile.TemporaryDirectory() as tempdir:
      with beam.Pipeline(options=ml_opts) as p:
        elements = p | beam.Create(DATA)
        result = elements | YamlTransform(
            f'''
            type: MLTransform
            config:
              write_artifact_location: {tempdir}
              transforms:
              - type: SentenceTransformerEmbeddings
                config:
                  model_name: all-MiniLM-L6-v2
                  columns: [log_message]
            ''')
        # Perform a basic check to ensure that embeddings are generated
        # and that the dimension of those embeddings is correct.
        actual_output = result | beam.Map(lambda x: len(x.log_message))
        assert_that(
            actual_output,
            equal_to([SENTENCE_EMBEDDING_DIMENSION] * len(DATA)))

  def test_ml_transform_outputs_schema(self):
    """MLTransform output rows carry a schema usable by MapToFields."""
    SENTENCE_EMBEDDING_DIMENSION = 384
    ml_opts = self._ml_options()
    with tempfile.TemporaryDirectory() as tempdir:
      with beam.Pipeline(options=ml_opts) as p:
        # Double braces escape the f-string; the YAML sees {id: 1, ...}.
        result = p | YamlTransform(
            f'''
            type: chain
            transforms:
            - type: Create
              config:
                elements:
                - {{id: 1, log_message: "Error in module A"}}
                - {{id: 2, log_message: "Warning in module B"}}
                - {{id: 3, log_message: "Info in module C"}}
            - type: MLTransform
              config:
                write_artifact_location: {tempdir}
                transforms:
                - type: SentenceTransformerEmbeddings
                  config:
                    model_name: all-MiniLM-L6-v2
                    columns: [log_message]
            - type: MapToFields
              config:
                language: python
                fields:
                  id: id
                  embedding: log_message
            ''')

        def check_row(row):
          # assert (not self.assert*) because this runs inside a DoFn.
          assert isinstance(row.id, int)
          assert isinstance(row.embedding, list)
          assert len(row.embedding) == SENTENCE_EMBEDDING_DIMENSION
          return row.id

        assert_that(result | beam.Map(check_row), equal_to([1, 2, 3]))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()