from typing import Any

from apadata.modules.clearbit_api_module import ClearbitAPIModule
from apadata.modules.elasticsearch_client_module import ElasticsearchClientModule
from apadata.modules.lang_detect_text_processor_module import (
    LangDetectTextProcessorModule,
)
from apadata.modules.sitemap_api_module import SitemapAPIModule
from apadata.modules.spacy_text_processor_module import SpacyTextProcessorModule
from apadata.modules.target_industries_mentions_module import (
    TargetIndustriesMentionsModule,
)
from apadata.pipelines.pipeline import Pipeline
from apadata.pipelines.pipeline_context import PipelineContext
from apadata.utils import flatten


class TargetIndustriesExtractionPipeline(Pipeline):
    """
    Pipeline that extracts the target industries.

    The pipeline is assembled from the following modules, registered in this
    order: ``SitemapAPIModule``, ``ElasticsearchClientModule``,
    ``LangDetectTextProcessorModule``, ``SpacyTextProcessorModule``,
    ``ClearbitAPIModule`` and ``TargetIndustriesMentionsModule``.

    Parameters
    ----------
    context: PipelineContext
        Context of the pipeline which receives and passes along data across a
        pipeline enriching it with information from several modules
    """

    def __init__(
        self,
        context: PipelineContext,
    ):
        super().__init__(
            context=context,
            modules=[
                SitemapAPIModule(),
                ElasticsearchClientModule(),
                LangDetectTextProcessorModule(),
                SpacyTextProcessorModule(),
                ClearbitAPIModule(),
                TargetIndustriesMentionsModule(),
            ],
        )

    def post_process(self, context: PipelineContext) -> Any:
        """
        Build the pipeline result from the ``"target_industries"`` context entry.

        Parameters
        ----------
        context: PipelineContext
            Context from which the ``"target_industries"`` entry is read

        Returns
        -------
        dict
            A dictionary with the single key ``"target_industries"`` mapping
            to a list of the flattened entry's distinct values, in order of
            first appearance
        """
        # NOTE(review): if ``context.get`` can return None when the key was
        # never set, ``flatten`` receives None here - confirm it tolerates that.
        industries = flatten(context.get("target_industries"))
        # ``dict.fromkeys`` removes duplicates while keeping first-seen order;
        # ``list(set(...))`` yielded an arbitrary order that could differ
        # between runs.
        return {"target_industries": list(dict.fromkeys(industries))}
